From 7cf809bb4b7401fbb9225f6bd54cca7dc2e9b968 Mon Sep 17 00:00:00 2001 From: Efrain Garay Date: Sun, 1 Mar 2026 10:57:15 -0300 Subject: [PATCH 01/16] feat(enterprise): add full/lite nats control-plane package with nestjs services --- README.md | 24 ++- ansible.cfg | 10 + control-plane/.dockerignore | 4 + control-plane/Dockerfile | 21 ++ control-plane/README.md | 32 +++ control-plane/package.json | 32 +++ control-plane/sql/001_init.sql | 29 +++ control-plane/src/broker/broker.module.ts | 8 + control-plane/src/broker/broker.runner.ts | 146 ++++++++++++++ control-plane/src/broker/main.ts | 14 ++ control-plane/src/common/config.ts | 36 ++++ control-plane/src/common/contracts.ts | 59 ++++++ control-plane/src/common/intents.ts | 35 ++++ control-plane/src/common/metrics.ts | 49 +++++ control-plane/src/common/nats.ts | 55 ++++++ control-plane/src/common/postgres.ts | 39 ++++ .../src/control-api/control.controller.ts | 46 +++++ .../src/control-api/control.module.ts | 10 + .../src/control-api/control.service.ts | 91 +++++++++ control-plane/src/control-api/main.ts | 28 +++ .../src/ingress/ingress.controller.ts | 36 ++++ control-plane/src/ingress/ingress.module.ts | 10 + control-plane/src/ingress/ingress.service.ts | 68 +++++++ control-plane/src/ingress/main.ts | 29 +++ control-plane/src/router/main.ts | 14 ++ control-plane/src/router/router.module.ts | 8 + control-plane/src/router/router.runner.ts | 73 +++++++ control-plane/src/worker/main.ts | 14 ++ control-plane/src/worker/worker.module.ts | 8 + control-plane/src/worker/worker.runner.ts | 87 +++++++++ control-plane/tsconfig.build.json | 7 + control-plane/tsconfig.json | 22 +++ docs/control-plane-stage2.md | 73 +++++++ docs/enterprise-deployment.md | 138 +++++++++++++ docs/epics/EPIC-002-control-plane-stage2.md | 32 +++ docs/evidence/STAGE2-validation.md | 13 ++ .../stories/STORY-201-full-stack-efra-core.md | 14 ++ docs/stories/STORY-202-lite-stack-andrea.md | 14 ++ ...RY-203-intent-routing-and-result-broker.md | 16 ++ 
...STORY-204-observability-and-control-api.md | 16 ++ inventories/README.md | 75 +++++++ inventories/dev/group_vars/all.yml | 97 ++++++++++ inventories/dev/group_vars/vault.example.yml | 16 ++ inventories/dev/host_vars/.gitkeep | 0 inventories/dev/hosts.yml | 29 +++ inventories/prod/group_vars/all.yml | 82 ++++++++ inventories/prod/group_vars/vault.example.yml | 5 + inventories/prod/host_vars/.gitkeep | 0 inventories/prod/hosts.yml | 16 ++ inventories/research/group_vars/all.yml | 41 ++++ .../research/group_vars/vault.example.yml | 4 + inventories/research/host_vars/.gitkeep | 0 inventories/research/hosts.yml | 13 ++ inventories/staging/group_vars/all.yml | 41 ++++ .../staging/group_vars/vault.example.yml | 4 + inventories/staging/host_vars/.gitkeep | 0 inventories/staging/hosts.yml | 13 ++ playbooks/enterprise.yml | 32 +++ .../openclaw_control_plane/defaults/main.yml | 40 ++++ roles/openclaw_control_plane/tasks/main.yml | 70 +++++++ .../openclaw_control_plane/tasks/profile.yml | 91 +++++++++ .../templates/control-plane.env.j2 | 13 ++ .../templates/docker-compose.full.yml.j2 | 183 ++++++++++++++++++ .../templates/docker-compose.lite.yml.j2 | 95 +++++++++ .../templates/grafana-datasources.yml.j2 | 9 + .../templates/prometheus.yml.j2 | 41 ++++ roles/openclaw_enterprise/defaults/main.yml | 47 +++++ roles/openclaw_enterprise/handlers/main.yml | 14 ++ roles/openclaw_enterprise/tasks/main.yml | 152 +++++++++++++++ .../openclaw-gateway-profile.service.j2 | 34 ++++ .../templates/openclaw-profile.json.j2 | 58 ++++++ .../templates/profile.env.j2 | 10 + run-enterprise-playbook.sh | 23 +++ 73 files changed, 2806 insertions(+), 2 deletions(-) create mode 100644 ansible.cfg create mode 100644 control-plane/.dockerignore create mode 100644 control-plane/Dockerfile create mode 100644 control-plane/README.md create mode 100644 control-plane/package.json create mode 100644 control-plane/sql/001_init.sql create mode 100644 control-plane/src/broker/broker.module.ts create mode 100644 
control-plane/src/broker/broker.runner.ts create mode 100644 control-plane/src/broker/main.ts create mode 100644 control-plane/src/common/config.ts create mode 100644 control-plane/src/common/contracts.ts create mode 100644 control-plane/src/common/intents.ts create mode 100644 control-plane/src/common/metrics.ts create mode 100644 control-plane/src/common/nats.ts create mode 100644 control-plane/src/common/postgres.ts create mode 100644 control-plane/src/control-api/control.controller.ts create mode 100644 control-plane/src/control-api/control.module.ts create mode 100644 control-plane/src/control-api/control.service.ts create mode 100644 control-plane/src/control-api/main.ts create mode 100644 control-plane/src/ingress/ingress.controller.ts create mode 100644 control-plane/src/ingress/ingress.module.ts create mode 100644 control-plane/src/ingress/ingress.service.ts create mode 100644 control-plane/src/ingress/main.ts create mode 100644 control-plane/src/router/main.ts create mode 100644 control-plane/src/router/router.module.ts create mode 100644 control-plane/src/router/router.runner.ts create mode 100644 control-plane/src/worker/main.ts create mode 100644 control-plane/src/worker/worker.module.ts create mode 100644 control-plane/src/worker/worker.runner.ts create mode 100644 control-plane/tsconfig.build.json create mode 100644 control-plane/tsconfig.json create mode 100644 docs/control-plane-stage2.md create mode 100644 docs/enterprise-deployment.md create mode 100644 docs/epics/EPIC-002-control-plane-stage2.md create mode 100644 docs/evidence/STAGE2-validation.md create mode 100644 docs/stories/STORY-201-full-stack-efra-core.md create mode 100644 docs/stories/STORY-202-lite-stack-andrea.md create mode 100644 docs/stories/STORY-203-intent-routing-and-result-broker.md create mode 100644 docs/stories/STORY-204-observability-and-control-api.md create mode 100644 inventories/README.md create mode 100644 inventories/dev/group_vars/all.yml create mode 100644 
inventories/dev/group_vars/vault.example.yml create mode 100644 inventories/dev/host_vars/.gitkeep create mode 100644 inventories/dev/hosts.yml create mode 100644 inventories/prod/group_vars/all.yml create mode 100644 inventories/prod/group_vars/vault.example.yml create mode 100644 inventories/prod/host_vars/.gitkeep create mode 100644 inventories/prod/hosts.yml create mode 100644 inventories/research/group_vars/all.yml create mode 100644 inventories/research/group_vars/vault.example.yml create mode 100644 inventories/research/host_vars/.gitkeep create mode 100644 inventories/research/hosts.yml create mode 100644 inventories/staging/group_vars/all.yml create mode 100644 inventories/staging/group_vars/vault.example.yml create mode 100644 inventories/staging/host_vars/.gitkeep create mode 100644 inventories/staging/hosts.yml create mode 100644 playbooks/enterprise.yml create mode 100644 roles/openclaw_control_plane/defaults/main.yml create mode 100644 roles/openclaw_control_plane/tasks/main.yml create mode 100644 roles/openclaw_control_plane/tasks/profile.yml create mode 100644 roles/openclaw_control_plane/templates/control-plane.env.j2 create mode 100644 roles/openclaw_control_plane/templates/docker-compose.full.yml.j2 create mode 100644 roles/openclaw_control_plane/templates/docker-compose.lite.yml.j2 create mode 100644 roles/openclaw_control_plane/templates/grafana-datasources.yml.j2 create mode 100644 roles/openclaw_control_plane/templates/prometheus.yml.j2 create mode 100644 roles/openclaw_enterprise/defaults/main.yml create mode 100644 roles/openclaw_enterprise/handlers/main.yml create mode 100644 roles/openclaw_enterprise/tasks/main.yml create mode 100644 roles/openclaw_enterprise/templates/openclaw-gateway-profile.service.j2 create mode 100644 roles/openclaw_enterprise/templates/openclaw-profile.json.j2 create mode 100644 roles/openclaw_enterprise/templates/profile.env.j2 create mode 100755 run-enterprise-playbook.sh diff --git a/README.md b/README.md index 
8bf4903..d4bd7ca 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) [![Lint](https://github.com/openclaw/openclaw-ansible/actions/workflows/lint.yml/badge.svg)](https://github.com/openclaw/openclaw-ansible/actions/workflows/lint.yml) [![Ansible](https://img.shields.io/badge/Ansible-2.14+-blue.svg)](https://www.ansible.com/) -[![Multi-OS](https://img.shields.io/badge/OS-Debian%20%7C%20Ubuntu-orange.svg)](https://www.debian.org/) +[![Multi-OS](https://img.shields.io/badge/OS-Debian%20%7C%20Ubuntu%20%7C%20Fedora-orange.svg)](https://www.debian.org/) Automated, hardened installation of [OpenClaw](https://github.com/openclaw/openclaw) with Docker and Tailscale VPN support for Debian/Ubuntu Linux. @@ -152,12 +152,14 @@ ansible-playbook playbook.yml --ask-become-pass - [Development Mode](docs/development-mode.md) - Build from source - [Security Architecture](docs/security.md) - Security details - [Technical Details](docs/architecture.md) - Architecture overview +- [Enterprise Deployment](docs/enterprise-deployment.md) - Multi-profile deployment +- [Stage 2 Control Plane](docs/control-plane-stage2.md) - NATS + NestJS full/lite package - [Troubleshooting](docs/troubleshooting.md) - Common issues - [Agent Guidelines](AGENTS.md) - AI agent instructions ## Requirements -- Debian 11+ or Ubuntu 20.04+ +- Debian 11+ or Ubuntu 20.04+ or Fedora 40+ - Root/sudo access - Internet connection @@ -291,3 +293,21 @@ MIT - see [LICENSE](LICENSE) - OpenClaw: https://github.com/openclaw/openclaw - This installer: https://github.com/openclaw/openclaw-ansible/issues + +## Enterprise Multi-Environment Deployment + +For multi-node environments with profile isolation and model/provider routing, use: + +- `playbooks/enterprise.yml` +- `inventories/dev|staging|prod|research` +- `roles/openclaw_enterprise` +- `run-enterprise-playbook.sh` + +Guide: `docs/enterprise-deployment.md` + +Android 
# Stage 1: install dependencies. Dev dependencies are included on purpose:
# the TypeScript compiler is needed by the build stage below.
FROM node:22-bookworm-slim AS deps
WORKDIR /app
COPY package.json ./
RUN npm install --omit=optional

# Stage 2: compile src/ into dist/.
FROM node:22-bookworm-slim AS build
WORKDIR /app
# package.json must exist in this stage: `npm run build` looks up the "build"
# script in it and fails if the file is missing.
COPY package.json ./
COPY --from=deps /app/node_modules ./node_modules
COPY tsconfig.json tsconfig.build.json ./
COPY src ./src
RUN npm run build

# Stage 3: runtime image. One image serves every service; SERVICE selects
# which dist/<service>/main.js is started.
FROM node:22-bookworm-slim AS runtime
WORKDIR /app
ENV NODE_ENV=production
COPY --from=deps /app/node_modules ./node_modules
COPY --from=build /app/dist ./dist
COPY sql ./sql
ARG SERVICE=ingress
ENV SERVICE_NAME=${SERVICE}
CMD ["sh", "-c", "node dist/${SERVICE_NAME}/main.js"]
- `ingress`: receives Telegram/API payloads and emits `tasks.ingress`
- `router`: classifies intent and routes to `tasks.agent.<agent>`
- `worker`: executes per agent and emits `results.agent.<agent>`
- `broker`: persists results and optionally replies to Telegram
- `control-api`: task state, queue stats, confirm/reject actions
-- Control-plane schema. Every statement is idempotent (IF NOT EXISTS), so the
-- script can be re-applied safely.

-- Latest known state of each task: one row per task_id.
CREATE TABLE IF NOT EXISTS tasks (
  task_id TEXT PRIMARY KEY,
  profile TEXT NOT NULL,
  source_channel TEXT,
  chat_id TEXT,
  user_id TEXT,
  intent TEXT,
  target_agent TEXT,
  status TEXT NOT NULL,
  needs_confirmation BOOLEAN NOT NULL DEFAULT FALSE,
  text_payload TEXT,
  summary TEXT,
  result_payload JSONB,
  created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
  updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);

-- Append-only log of events per task. task_id is deliberately not a foreign
-- key here, so an event can be stored for a task_id that has no tasks row.
CREATE TABLE IF NOT EXISTS task_events (
  id BIGSERIAL PRIMARY KEY,
  task_id TEXT NOT NULL,
  profile TEXT NOT NULL,
  event_type TEXT NOT NULL,
  from_agent TEXT,
  payload JSONB,
  created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);

-- Supports "most recently updated first" listings of tasks.
CREATE INDEX IF NOT EXISTS idx_tasks_updated_at ON tasks(updated_at DESC);
-- Supports looking up all events of one task.
CREATE INDEX IF NOT EXISTS idx_events_task_id ON task_events(task_id);
/**
 * Result broker: consumes agent results from JetStream (`results.agent.*`),
 * persists them to Postgres and, when a bot token is configured, relays them
 * to Telegram.
 */
@Injectable()
export class BrokerRunner implements OnModuleInit, OnModuleDestroy {
  /** Telegram rejects `sendMessage` texts longer than this many characters. */
  private static readonly TELEGRAM_MAX_TEXT = 4096;
  /** Upper bound for one Telegram HTTP call so a hung request cannot stall the consumer loop. */
  private static readonly TELEGRAM_TIMEOUT_MS = 10_000;
  /** Delay before JetStream redelivers a message whose persistence failed. */
  private static readonly REDELIVERY_DELAY_MS = 5_000;

  private readonly logger = new Logger(BrokerRunner.name);
  private readonly cfg = loadConfig('broker');
  private nc: NatsConnection | null = null;
  private pg: Pool | null = null;
  private messages: ConsumerMessages | null = null;
  private metrics: ServiceMetrics | null = null;
  private metricsServer: ReturnType<typeof startMetricsServer> | null = null;

  /** Starts metrics, prepares Postgres and NATS, then launches the consume loop in the background. */
  async onModuleInit(): Promise<void> {
    this.metrics = initMetrics('broker');
    this.metricsServer = startMetricsServer(this.cfg.metricsPort, this.metrics.registry);

    this.pg = createPgPool(this.cfg.pgUrl);
    await migrate(this.pg);

    this.nc = await connectNats(this.cfg.natsUrl);
    await ensureStream(this.nc, this.cfg.natsStream);
    const consumer = await ensureConsumer(
      this.nc,
      this.cfg.natsStream,
      `${this.cfg.profile}-broker`,
      'results.agent.*'
    );
    this.messages = await consumer.consume();

    // Intentionally not awaited: the loop runs for the lifetime of the service.
    this.run().catch((error: unknown) => this.logger.error(`Broker loop failed: ${String(error)}`));

    this.logger.log('Broker started');
  }

  /** Stops consuming first, then releases NATS, Postgres and the metrics server. */
  async onModuleDestroy(): Promise<void> {
    await this.messages?.close();
    await this.nc?.drain();
    await this.pg?.end();
    this.metricsServer?.close();
  }

  /**
   * Consume loop. Per message:
   *  - an undecodable payload is terminated (it can never succeed on redelivery);
   *  - a persistence failure is NAKed with a delay so JetStream retries later;
   *  - Telegram delivery is best-effort: once the result is persisted, a failed
   *    notification is only logged. NAKing here would re-run `persist` and
   *    append a duplicate `task_events` row on every retry.
   */
  private async run(): Promise<void> {
    if (!this.messages || !this.nc || !this.pg) {
      throw new Error('Broker is not initialized');
    }

    for await (const msg of this.messages) {
      let result: TaskResult;
      try {
        result = decodeJson<TaskResult>(msg.data);
      } catch (error) {
        this.metrics?.failedMessages.inc();
        msg.term();
        this.logger.error(`Broker dropped undecodable result: ${String(error)}`);
        continue;
      }

      try {
        await this.persist(result);
      } catch (error) {
        this.metrics?.failedMessages.inc();
        msg.nak(BrokerRunner.REDELIVERY_DELAY_MS);
        this.logger.error(`Broker failed to process result: ${String(error)}`);
        continue;
      }

      try {
        await this.maybeSendTelegram(result);
      } catch (error) {
        // Same treatment as a non-2xx Telegram response: warn, do not retry.
        this.logger.warn(`Telegram send failed: ${String(error)}`);
      }

      this.metrics?.handledMessages.inc();
      msg.ack();
    }
  }

  /**
   * Upserts the task row with the latest result and appends a `result` event.
   *
   * @throws Error when the Postgres pool has not been created yet.
   */
  private async persist(result: TaskResult): Promise<void> {
    if (!this.pg) {
      // Returning silently here would let the caller ack an unpersisted result.
      throw new Error('Broker is not initialized');
    }

    const payload = JSON.stringify(result);

    await this.pg.query(
      `
      INSERT INTO tasks (
        task_id, profile, source_channel, chat_id, user_id, intent, target_agent,
        status, needs_confirmation, summary, result_payload, updated_at
      ) VALUES ($1, $2, $3, $4, $5, NULL, $6, $7, $8, $9, $10::jsonb, NOW())
      ON CONFLICT (task_id)
      DO UPDATE SET
        status = EXCLUDED.status,
        needs_confirmation = EXCLUDED.needs_confirmation,
        summary = EXCLUDED.summary,
        result_payload = EXCLUDED.result_payload,
        target_agent = EXCLUDED.target_agent,
        updated_at = NOW();
      `,
      [
        result.taskId,
        result.profile,
        result.source?.channel ?? 'telegram',
        result.source?.chatId ?? null,
        result.source?.userId ?? null,
        result.fromAgent,
        result.status,
        result.needsConfirmation,
        result.summary,
        payload
      ]
    );

    await this.pg.query(
      `
      INSERT INTO task_events (task_id, profile, event_type, from_agent, payload)
      VALUES ($1, $2, $3, $4, $5::jsonb)
      `,
      [result.taskId, result.profile, 'result', result.fromAgent, payload]
    );
  }

  /**
   * Sends the result to Telegram when a bot token and a chat id are available.
   * Only `fullResponse` is shortened when the message would exceed Telegram's
   * limit, so the task header and the confirmation prompt always survive.
   * Network errors and timeouts propagate to the caller.
   */
  private async maybeSendTelegram(result: TaskResult): Promise<void> {
    if (!this.cfg.telegramBotToken) {
      return;
    }

    const chatId = result.source?.chatId ?? this.cfg.telegramDefaultChatId;
    if (!chatId) {
      return;
    }

    const head = [`[agent=${result.fromAgent}] [task=${result.taskId}]`, result.summary];
    const tail = result.needsConfirmation
      ? [`Accion pendiente. Responde: confirmar ${result.taskId} o rechazar ${result.taskId}`]
      : [];

    // Characters left for fullResponse: everything else plus the newline that joins it.
    const fixedLength = [...head, ...tail].join('\n').length + 1;
    const budget = Math.max(0, BrokerRunner.TELEGRAM_MAX_TEXT - fixedLength);
    const full = result.fullResponse ?? '';
    const body = full.length > budget ? `${full.slice(0, Math.max(0, budget - 1))}…` : full;

    // Final slice covers the corner case where the fixed parts alone exceed the limit.
    const text = [...head, body, ...tail].join('\n').slice(0, BrokerRunner.TELEGRAM_MAX_TEXT);

    const response = await fetch(
      `https://api.telegram.org/bot${this.cfg.telegramBotToken}/sendMessage`,
      {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({
          chat_id: chatId,
          text
        }),
        signal: AbortSignal.timeout(BrokerRunner.TELEGRAM_TIMEOUT_MS)
      }
    );

    if (!response.ok) {
      const errorBody = await response.text();
      this.logger.warn(`Telegram send failed: ${response.status} ${errorBody}`);
    }
  }
}
/** Lifecycle states a task can be in. */
export type TaskStatus =
  | 'NEW'
  | 'ROUTED'
  | 'RUNNING'
  | 'WAITING_CONFIRMATION'
  | 'DONE'
  | 'FAILED'
  | 'DEAD_LETTER';

/** Message describing one task as it travels between services. */
export interface TaskEnvelope {
  taskId: string;
  profile: string;
  /** Where the task came from; chat/user fields are optional. */
  source: {
    channel: 'telegram' | 'api' | 'system';
    chatId?: string;
    userId?: string;
    username?: string;
  };
  text: string;
  intent?: string;
  targetAgent?: string;
  priority?: number;
  budgetTokens?: number;
  status: TaskStatus;
  // NOTE(review): the type arguments were lost from this declaration; a
  // string-keyed bag of unknown values is assumed — confirm.
  metadata?: Record<string, unknown>;
  /** Creation timestamp as a string (presumably ISO-8601 — confirm against producers). */
  createdAt: string;
}

/** Outcome reported by an agent for one task. */
export interface TaskResult {
  taskId: string;
  profile: string;
  fromAgent: string;
  // NOTE(review): the type arguments of this Extract were lost; the subset
  // below is an assumption (terminal states plus the confirmation wait
  // implied by `needsConfirmation`) — confirm against the worker.
  status: Extract<TaskStatus, 'DONE' | 'FAILED' | 'WAITING_CONFIRMATION'>;
  summary: string;
  fullResponse: string;
  needsConfirmation: boolean;
  suggestedAction?: string;
  tokenUsage?: number;
  costEstimate?: number;
  source?: TaskEnvelope['source'];
  createdAt: string;
}

/** Operator decision on a task that is waiting for confirmation. */
export interface ConfirmationCommand {
  taskId: string;
  profile: string;
  decision: 'confirm' | 'reject';
  note?: string;
  actor: string;
  createdAt: string;
}

/** Snapshot of one JetStream stream's counters. */
export interface QueueStats {
  stream: string;
  messages: number;
  bytes: number;
  firstSeq: number;
  lastSeq: number;
}
/**
 * Keyword routing table, evaluated top to bottom: the first rule with any
 * keyword contained in the lower-cased text wins.
 */
const ROUTING_RULES: Array<{ intent: string; agent: string; keywords: string[] }> = [
  {
    intent: 'browser.login',
    agent: 'browser-login',
    keywords: ['login', 'browser', 'portal', 'cookie', 'captcha']
  },
  {
    intent: 'deploy.coolify',
    agent: 'coolify-ops',
    keywords: ['coolify', 'deploy', 'release', 'rollback', 'service up', 'service down']
  },
  {
    intent: 'research.analysis',
    agent: 'research',
    keywords: ['investiga', 'analiza', 'research', 'comparar', 'resumen', 'benchmark']
  }
];

/**
 * Maps free text to an intent and the agent that should handle it.
 *
 * Matching is a case-insensitive substring test against each rule's keywords.
 * Text that matches no rule is routed to the `main` agent.
 */
export function classifyIntent(text: string): { intent: string; targetAgent: string } {
  const haystack = text.toLowerCase();
  const matched = ROUTING_RULES.find(({ keywords }) =>
    keywords.some((keyword) => haystack.includes(keyword))
  );

  if (matched === undefined) {
    return { intent: 'general.main', targetAgent: 'main' };
  }

  return { intent: matched.intent, targetAgent: matched.agent };
}

/**
 * Reports whether the text contains any token that marks a destructive action.
 * The check is a case-insensitive substring test, so it also fires on longer
 * words that contain a token (e.g. "download" contains "down").
 */
export function actionNeedsConfirmation(text: string): boolean {
  const haystack = text.toLowerCase();
  const riskyTokens = ['delete', 'drop', 'destroy', 'stop', 'down', 'wipe', 'rm -rf', 'shutdown'];

  for (const token of riskyTokens) {
    if (haystack.includes(token)) {
      return true;
    }
  }

  return false;
}
/**
 * Starts a small HTTP server exposing `/health` and `/metrics`.
 *
 * @param port     TCP port to listen on (bound on all interfaces).
 * @param registry prom-client registry whose metrics are served.
 * @returns the listening `http.Server`, so the caller can close it on shutdown.
 */
export function startMetricsServer(port: number, registry: Registry): http.Server {
  const server = http.createServer(async (req, res) => {
    // Compare the path only, so a scrape URL with a query string still matches.
    const path = (req.url ?? '').split('?', 1)[0];

    try {
      if (path === '/health') {
        res.statusCode = 200;
        res.end('ok');
        return;
      }

      if (path === '/metrics') {
        const body = await registry.metrics();
        res.setHeader('Content-Type', registry.contentType);
        res.end(body);
        return;
      }

      res.statusCode = 404;
      res.end('not found');
    } catch {
      // http.createServer ignores the promise returned by an async listener.
      // Without this catch a failing collector becomes an unhandled rejection
      // and the request is never answered.
      if (!res.headersSent) {
        res.statusCode = 500;
      }
      res.end('metrics collection failed');
    }
  });

  server.listen(port, '0.0.0.0');
  return server;
}
/**
 * Arbitrary but stable key of the advisory lock that serializes schema
 * migration across every process calling `migrate`.
 */
const MIGRATION_LOCK_KEY = 7_204_611;

/** Creates a connection pool for the given Postgres connection string. */
export function createPgPool(connectionString: string): Pool {
  return new Pool({ connectionString });
}

/**
 * Creates the control-plane tables and indexes if they do not exist yet.
 *
 * Several services run this at startup. Concurrent `CREATE TABLE IF NOT
 * EXISTS` statements can still collide inside Postgres, so the DDL runs in one
 * transaction guarded by a transaction-scoped advisory lock: later callers
 * wait, then find everything already present.
 *
 * @throws whatever the driver throws; the transaction is rolled back first.
 */
export async function migrate(pool: Pool): Promise<void> {
  // Advisory locks belong to a session, so one dedicated client is required.
  const client = await pool.connect();

  try {
    await client.query('BEGIN');
    // Released automatically at COMMIT or ROLLBACK.
    await client.query(`SELECT pg_advisory_xact_lock(${MIGRATION_LOCK_KEY})`);
    await client.query(`
      CREATE TABLE IF NOT EXISTS tasks (
        task_id TEXT PRIMARY KEY,
        profile TEXT NOT NULL,
        source_channel TEXT,
        chat_id TEXT,
        user_id TEXT,
        intent TEXT,
        target_agent TEXT,
        status TEXT NOT NULL,
        needs_confirmation BOOLEAN NOT NULL DEFAULT FALSE,
        text_payload TEXT,
        summary TEXT,
        result_payload JSONB,
        created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
        updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
      );

      CREATE TABLE IF NOT EXISTS task_events (
        id BIGSERIAL PRIMARY KEY,
        task_id TEXT NOT NULL,
        profile TEXT NOT NULL,
        event_type TEXT NOT NULL,
        from_agent TEXT,
        payload JSONB,
        created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
      );

      CREATE INDEX IF NOT EXISTS idx_tasks_updated_at ON tasks(updated_at DESC);
      CREATE INDEX IF NOT EXISTS idx_events_task_id ON task_events(task_id);
    `);
    await client.query('COMMIT');
  } catch (error) {
    // A failed ROLLBACK must not mask the original error, which is rethrown below.
    await client.query('ROLLBACK').catch(() => undefined);
    throw error;
  } finally {
    client.release();
  }
}
{ + return { ok: true }; + } + + @Get('/tasks') + async tasks(@Query('limit', new ParseIntPipe({ optional: true })) limit?: number): Promise { + return this.controlService.listTasks(limit ?? 100); + } + + @Get('/tasks/:taskId') + async task(@Param('taskId') taskId: string): Promise { + return this.controlService.getTask(taskId); + } + + @Post('/tasks/:taskId/confirm') + async confirm( + @Param('taskId') taskId: string, + @Body() body: { actor?: string; note?: string } + ): Promise<{ ok: true }> { + await this.controlService.setDecision(taskId, 'confirm', body.actor ?? 'operator', body.note); + return { ok: true }; + } + + @Post('/tasks/:taskId/reject') + async reject( + @Param('taskId') taskId: string, + @Body() body: { actor?: string; note?: string } + ): Promise<{ ok: true }> { + await this.controlService.setDecision(taskId, 'reject', body.actor ?? 'operator', body.note); + return { ok: true }; + } + + @Get('/queues') + async queues(): Promise { + return this.controlService.queueStats(); + } +} diff --git a/control-plane/src/control-api/control.module.ts b/control-plane/src/control-api/control.module.ts new file mode 100644 index 0000000..a904782 --- /dev/null +++ b/control-plane/src/control-api/control.module.ts @@ -0,0 +1,10 @@ +import { Module } from '@nestjs/common'; + +import { ControlController } from './control.controller'; +import { ControlService } from './control.service'; + +@Module({ + controllers: [ControlController], + providers: [ControlService] +}) +export class ControlModule {} diff --git a/control-plane/src/control-api/control.service.ts b/control-plane/src/control-api/control.service.ts new file mode 100644 index 0000000..b9a81b9 --- /dev/null +++ b/control-plane/src/control-api/control.service.ts @@ -0,0 +1,91 @@ +import { Injectable, Logger, OnModuleDestroy, OnModuleInit } from '@nestjs/common'; +import type { NatsConnection } from 'nats'; +import type { Pool } from 'pg'; + +import { type ConfirmationCommand, type QueueStats } from 
/**
 * Backing service of the control API: reads task state from Postgres, publishes
 * operator decisions to NATS and reports stream statistics.
 */
@Injectable()
export class ControlService implements OnModuleInit, OnModuleDestroy {
  private readonly logger = new Logger(ControlService.name);
  private readonly cfg = loadConfig('control_api');
  private pg: Pool | null = null;
  private nc: NatsConnection | null = null;

  /** Opens the Postgres pool (running the schema migration) and the NATS connection. */
  async onModuleInit(): Promise<void> {
    this.pg = createPgPool(this.cfg.pgUrl);
    await migrate(this.pg);

    this.nc = await connectNats(this.cfg.natsUrl);
    this.logger.log('Control API ready');
  }

  /** Drains NATS, then closes the Postgres pool. */
  async onModuleDestroy(): Promise<void> {
    await this.nc?.drain();
    await this.pg?.end();
  }

  /**
   * Lists tasks, most recently updated first.
   *
   * @param limit requested row count; clamped to 1..500. A non-finite value
   *              falls back to 100 instead of reaching SQL as `NaN`.
   * @returns the rows, or an empty array when the pool is not available.
   */
  async listTasks(limit = 100): Promise<unknown[]> {
    const requested = Number.isFinite(limit) ? Math.trunc(limit) : 100;
    const safeLimit = Math.max(1, Math.min(requested, 500));
    const result = await this.pg?.query(
      `SELECT task_id, profile, source_channel, chat_id, user_id, target_agent, status, needs_confirmation, summary, updated_at
       FROM tasks
       ORDER BY updated_at DESC
       LIMIT $1`,
      [safeLimit]
    );

    return result?.rows ?? [];
  }

  /** Returns one task row including its `result_payload`, or `null` when not found. */
  async getTask(taskId: string): Promise<unknown> {
    const result = await this.pg?.query(
      `SELECT task_id, profile, source_channel, chat_id, user_id, target_agent, status, needs_confirmation, summary, result_payload, updated_at
       FROM tasks
       WHERE task_id = $1`,
      [taskId]
    );

    return result?.rows?.[0] ?? null;
  }

  /**
   * Publishes a confirm/reject command on `control.<decision>.<profile>` and
   * records it as a `decision_<decision>` event.
   *
   * @throws Error when NATS or Postgres is not initialized. Skipping silently
   *         would report success to the operator for a decision that was
   *         neither published nor recorded.
   */
  async setDecision(taskId: string, decision: 'confirm' | 'reject', actor: string, note?: string): Promise<void> {
    if (!this.nc || !this.pg) {
      throw new Error('Control API is not initialized');
    }

    const command: ConfirmationCommand = {
      taskId,
      profile: this.cfg.profile,
      decision,
      note,
      actor,
      createdAt: new Date().toISOString()
    };

    this.nc.publish(`control.${decision}.${this.cfg.profile}`, encodeJson(command));

    await this.pg.query(
      `
      INSERT INTO task_events (task_id, profile, event_type, from_agent, payload)
      VALUES ($1, $2, $3, $4, $5::jsonb)
      `,
      [taskId, this.cfg.profile, `decision_${decision}`, actor, JSON.stringify(command)]
    );
  }

  /** Returns message/byte/sequence counters of the configured stream, or `null` without a NATS connection. */
  async queueStats(): Promise<QueueStats | null> {
    if (!this.nc) {
      return null;
    }

    const jsm = await this.nc.jetstreamManager();
    const info = await jsm.streams.info(this.cfg.natsStream);

    return {
      stream: info.config.name,
      messages: info.state.messages,
      bytes: info.state.bytes,
      firstSeq: info.state.first_seq,
      lastSeq: info.state.last_seq
    };
  }
}
'39090', 10); + await app.listen(port, '0.0.0.0'); + + const metrics = initMetrics('control_api'); + startMetricsServer(cfg.metricsPort, metrics.registry); + + logger.log(`Control API listening on :${port}`); + logger.log(`Metrics listening on :${cfg.metricsPort}`); +} + +bootstrap().catch((error: unknown) => { + // eslint-disable-next-line no-console + console.error(error); + process.exit(1); +}); diff --git a/control-plane/src/ingress/ingress.controller.ts b/control-plane/src/ingress/ingress.controller.ts new file mode 100644 index 0000000..4e7fb43 --- /dev/null +++ b/control-plane/src/ingress/ingress.controller.ts @@ -0,0 +1,36 @@ +import { Body, Controller, Get, HttpCode, Post } from '@nestjs/common'; + +import { IngressService } from './ingress.service'; + +@Controller() +export class IngressController { + constructor(private readonly ingressService: IngressService) {} + + @Get('/health') + health(): { ok: true } { + return { ok: true }; + } + + @Post('/telegram/webhook') + @HttpCode(202) + async telegramWebhook(@Body() body: unknown): Promise<{ accepted: true; taskId: string }> { + const result = await this.ingressService.ingestTelegram(body as never); + return { accepted: true, taskId: result.taskId }; + } + + @Post('/ingress/simulate') + @HttpCode(202) + async simulate( + @Body() body: { text: string; chatId?: string; userId?: string; username?: string } + ): Promise<{ accepted: true; taskId: string }> { + const result = await this.ingressService.ingestTelegram({ + message: { + text: body.text, + chat: { id: body.chatId ?? 'local-sim' }, + from: { id: body.userId ?? 'local-user', username: body.username ?? 
'simulator' } + } + }); + + return { accepted: true, taskId: result.taskId }; + } +} diff --git a/control-plane/src/ingress/ingress.module.ts b/control-plane/src/ingress/ingress.module.ts new file mode 100644 index 0000000..b6c682d --- /dev/null +++ b/control-plane/src/ingress/ingress.module.ts @@ -0,0 +1,10 @@ +import { Module } from '@nestjs/common'; + +import { IngressController } from './ingress.controller'; +import { IngressService } from './ingress.service'; + +@Module({ + controllers: [IngressController], + providers: [IngressService] +}) +export class IngressModule {} diff --git a/control-plane/src/ingress/ingress.service.ts b/control-plane/src/ingress/ingress.service.ts new file mode 100644 index 0000000..d2e61e1 --- /dev/null +++ b/control-plane/src/ingress/ingress.service.ts @@ -0,0 +1,68 @@ +import { Injectable, Logger, OnModuleDestroy, OnModuleInit } from '@nestjs/common'; +import { randomUUID } from 'node:crypto'; +import type { NatsConnection } from 'nats'; + +import { type AppConfig, loadConfig } from '../common/config'; +import { type TaskEnvelope } from '../common/contracts'; +import { connectNats, encodeJson, ensureStream } from '../common/nats'; + +interface TelegramUpdate { + message?: { + text?: string; + chat?: { id?: number | string }; + from?: { id?: number | string; username?: string }; + }; +} + +@Injectable() +export class IngressService implements OnModuleInit, OnModuleDestroy { + private readonly logger = new Logger(IngressService.name); + private readonly cfg: AppConfig = loadConfig('ingress'); + private nc: NatsConnection | null = null; + + async onModuleInit(): Promise { + this.nc = await connectNats(this.cfg.natsUrl); + await ensureStream(this.nc, this.cfg.natsStream); + this.logger.log(`Connected to NATS at ${this.cfg.natsUrl}`); + } + + async onModuleDestroy(): Promise { + await this.nc?.drain(); + } + + async ingestTelegram(body: TelegramUpdate): Promise<{ taskId: string }> { + if (!this.nc) { + throw new Error('NATS is not 
ready'); + } + + const text = body.message?.text?.trim() ?? ''; + const chatId = body.message?.chat?.id ? String(body.message.chat.id) : this.cfg.telegramDefaultChatId; + const userId = body.message?.from?.id ? String(body.message.from.id) : undefined; + const username = body.message?.from?.username; + + if (!text) { + throw new Error('Message text is required'); + } + + const task: TaskEnvelope = { + taskId: randomUUID(), + profile: this.cfg.profile, + source: { + channel: 'telegram', + chatId, + userId, + username + }, + text, + status: 'NEW', + priority: 5, + budgetTokens: 4000, + createdAt: new Date().toISOString() + }; + + this.nc.publish('tasks.ingress', encodeJson(task)); + this.logger.log(`Queued task ${task.taskId} for ingress`); + + return { taskId: task.taskId }; + } +} diff --git a/control-plane/src/ingress/main.ts b/control-plane/src/ingress/main.ts new file mode 100644 index 0000000..b639c65 --- /dev/null +++ b/control-plane/src/ingress/main.ts @@ -0,0 +1,29 @@ +import 'reflect-metadata'; +import { Logger } from '@nestjs/common'; +import { NestFactory } from '@nestjs/core'; + +import { loadConfig } from '../common/config'; +import { initMetrics, startMetricsServer } from '../common/metrics'; +import { IngressModule } from './ingress.module'; + +async function bootstrap(): Promise { + const cfg = loadConfig('ingress'); + const logger = new Logger('IngressMain'); + + const app = await NestFactory.create(IngressModule, { bufferLogs: true }); + + const port = Number.parseInt(process.env.HTTP_PORT ?? 
'3000', 10); + await app.listen(port, '0.0.0.0'); + + const metrics = initMetrics('ingress'); + startMetricsServer(cfg.metricsPort, metrics.registry); + + logger.log(`Ingress service listening on :${port}`); + logger.log(`Metrics listening on :${cfg.metricsPort}`); +} + +bootstrap().catch((error: unknown) => { + // eslint-disable-next-line no-console + console.error(error); + process.exit(1); +}); diff --git a/control-plane/src/router/main.ts b/control-plane/src/router/main.ts new file mode 100644 index 0000000..98abd4c --- /dev/null +++ b/control-plane/src/router/main.ts @@ -0,0 +1,14 @@ +import 'reflect-metadata'; +import { NestFactory } from '@nestjs/core'; + +import { RouterModule } from './router.module'; + +async function bootstrap(): Promise { + await NestFactory.createApplicationContext(RouterModule, { bufferLogs: true }); +} + +bootstrap().catch((error: unknown) => { + // eslint-disable-next-line no-console + console.error(error); + process.exit(1); +}); diff --git a/control-plane/src/router/router.module.ts b/control-plane/src/router/router.module.ts new file mode 100644 index 0000000..c4bcd00 --- /dev/null +++ b/control-plane/src/router/router.module.ts @@ -0,0 +1,8 @@ +import { Module } from '@nestjs/common'; + +import { RouterRunner } from './router.runner'; + +@Module({ + providers: [RouterRunner] +}) +export class RouterModule {} diff --git a/control-plane/src/router/router.runner.ts b/control-plane/src/router/router.runner.ts new file mode 100644 index 0000000..e364ad9 --- /dev/null +++ b/control-plane/src/router/router.runner.ts @@ -0,0 +1,73 @@ +import { Injectable, Logger, OnModuleDestroy, OnModuleInit } from '@nestjs/common'; +import type { ConsumerMessages, NatsConnection } from 'nats'; + +import { loadConfig } from '../common/config'; +import { classifyIntent } from '../common/intents'; +import { type ServiceMetrics, initMetrics, startMetricsServer } from '../common/metrics'; +import { type TaskEnvelope } from '../common/contracts'; +import { 
connectNats, decodeJson, encodeJson, ensureConsumer, ensureStream } from '../common/nats'; + +@Injectable() +export class RouterRunner implements OnModuleInit, OnModuleDestroy { + private readonly logger = new Logger(RouterRunner.name); + private readonly cfg = loadConfig('router'); + private nc: NatsConnection | null = null; + private messages: ConsumerMessages | null = null; + private metrics: ServiceMetrics | null = null; + private metricsServer: ReturnType | null = null; + + async onModuleInit(): Promise { + this.metrics = initMetrics('router'); + this.metricsServer = startMetricsServer(this.cfg.metricsPort, this.metrics.registry); + + this.nc = await connectNats(this.cfg.natsUrl); + await ensureStream(this.nc, this.cfg.natsStream); + const consumer = await ensureConsumer(this.nc, this.cfg.natsStream, `${this.cfg.profile}-router`, 'tasks.ingress'); + this.messages = await consumer.consume(); + + this.run().catch((error: unknown) => { + this.logger.error(`Router loop failed: ${String(error)}`); + }); + + this.logger.log(`Router running for profile ${this.cfg.profile}`); + } + + async onModuleDestroy(): Promise { + this.messages?.close(); + await this.nc?.drain(); + this.metricsServer?.close(); + } + + private async run(): Promise { + if (!this.messages || !this.nc) { + throw new Error('Router is not initialized'); + } + + for await (const msg of this.messages) { + try { + const task = decodeJson(msg.data); + const forced = this.cfg.routerForcedAgent.trim(); + const routed = forced + ? 
{ intent: `forced.${forced}`, targetAgent: forced } + : classifyIntent(task.text); + + const enrichedTask: TaskEnvelope = { + ...task, + intent: routed.intent, + targetAgent: routed.targetAgent, + status: 'ROUTED' + }; + + this.nc.publish(`tasks.agent.${routed.targetAgent}`, encodeJson(enrichedTask)); + this.nc.publish('tasks.events', encodeJson({ type: 'task_routed', taskId: task.taskId, routed })); + + this.metrics?.handledMessages.inc(); + msg.ack(); + } catch (error) { + this.metrics?.failedMessages.inc(); + msg.nak(); + this.logger.error(`Failed to route task: ${String(error)}`); + } + } + } +} diff --git a/control-plane/src/worker/main.ts b/control-plane/src/worker/main.ts new file mode 100644 index 0000000..3175342 --- /dev/null +++ b/control-plane/src/worker/main.ts @@ -0,0 +1,14 @@ +import 'reflect-metadata'; +import { NestFactory } from '@nestjs/core'; + +import { WorkerModule } from './worker.module'; + +async function bootstrap(): Promise { + await NestFactory.createApplicationContext(WorkerModule, { bufferLogs: true }); +} + +bootstrap().catch((error: unknown) => { + // eslint-disable-next-line no-console + console.error(error); + process.exit(1); +}); diff --git a/control-plane/src/worker/worker.module.ts b/control-plane/src/worker/worker.module.ts new file mode 100644 index 0000000..b473b59 --- /dev/null +++ b/control-plane/src/worker/worker.module.ts @@ -0,0 +1,8 @@ +import { Module } from '@nestjs/common'; + +import { WorkerRunner } from './worker.runner'; + +@Module({ + providers: [WorkerRunner] +}) +export class WorkerModule {} diff --git a/control-plane/src/worker/worker.runner.ts b/control-plane/src/worker/worker.runner.ts new file mode 100644 index 0000000..97fe4b6 --- /dev/null +++ b/control-plane/src/worker/worker.runner.ts @@ -0,0 +1,87 @@ +import { Injectable, Logger, OnModuleDestroy, OnModuleInit } from '@nestjs/common'; +import type { ConsumerMessages, NatsConnection } from 'nats'; + +import { loadConfig } from '../common/config'; 
+import { type TaskEnvelope, type TaskResult } from '../common/contracts'; +import { actionNeedsConfirmation } from '../common/intents'; +import { type ServiceMetrics, initMetrics, startMetricsServer } from '../common/metrics'; +import { connectNats, decodeJson, encodeJson, ensureConsumer, ensureStream } from '../common/nats'; + +@Injectable() +export class WorkerRunner implements OnModuleInit, OnModuleDestroy { + private readonly logger = new Logger(WorkerRunner.name); + private readonly cfg = loadConfig('worker'); + private nc: NatsConnection | null = null; + private messages: ConsumerMessages | null = null; + private metrics: ServiceMetrics | null = null; + private metricsServer: ReturnType | null = null; + + async onModuleInit(): Promise { + this.metrics = initMetrics(`worker_${this.cfg.workerAgentId.replace('-', '_')}`); + this.metricsServer = startMetricsServer(this.cfg.metricsPort, this.metrics.registry); + + this.nc = await connectNats(this.cfg.natsUrl); + await ensureStream(this.nc, this.cfg.natsStream); + + const durable = `${this.cfg.profile}-worker-${this.cfg.workerAgentId}`; + const filter = `tasks.agent.${this.cfg.workerAgentId}`; + const consumer = await ensureConsumer(this.nc, this.cfg.natsStream, durable, filter); + this.messages = await consumer.consume(); + + this.run().catch((error: unknown) => this.logger.error(`Worker loop failed: ${String(error)}`)); + + this.logger.log(`Worker ${this.cfg.workerAgentId} online`); + } + + async onModuleDestroy(): Promise { + this.messages?.close(); + await this.nc?.drain(); + this.metricsServer?.close(); + } + + private async run(): Promise { + if (!this.messages || !this.nc) { + throw new Error('Worker is not initialized'); + } + + for await (const msg of this.messages) { + try { + const task = decodeJson(msg.data); + const result = await this.processTask(task); + this.nc.publish(`results.agent.${this.cfg.workerAgentId}`, encodeJson(result)); + this.metrics?.handledMessages.inc(); + msg.ack(); + } catch 
(error) { + this.metrics?.failedMessages.inc(); + msg.nak(); + this.logger.error(`Failed task processing: ${String(error)}`); + } + } + } + + private async processTask(task: TaskEnvelope): Promise { + const needsConfirmation = actionNeedsConfirmation(task.text); + const summary = `Task ${task.taskId} routed to ${this.cfg.workerAgentId}`; + + const fullResponse = needsConfirmation + ? `Action requires confirmation before execution: ${task.text}` + : `Processed by ${this.cfg.workerAgentId}: ${task.text}`; + + const status: TaskResult['status'] = needsConfirmation ? 'WAITING_CONFIRMATION' : 'DONE'; + + return { + taskId: task.taskId, + profile: task.profile, + fromAgent: this.cfg.workerAgentId, + status, + summary, + fullResponse, + needsConfirmation, + suggestedAction: needsConfirmation ? `confirmar ${task.taskId}` : undefined, + tokenUsage: Math.min(300, task.text.length * 2), + costEstimate: 0, + source: task.source, + createdAt: new Date().toISOString() + }; + } +} diff --git a/control-plane/tsconfig.build.json b/control-plane/tsconfig.build.json new file mode 100644 index 0000000..eacde2c --- /dev/null +++ b/control-plane/tsconfig.build.json @@ -0,0 +1,7 @@ +{ + "extends": "./tsconfig.json", + "compilerOptions": { + "noEmit": false + }, + "exclude": ["node_modules", "dist", "**/*.spec.ts"] +} diff --git a/control-plane/tsconfig.json b/control-plane/tsconfig.json new file mode 100644 index 0000000..7ca417a --- /dev/null +++ b/control-plane/tsconfig.json @@ -0,0 +1,22 @@ +{ + "compilerOptions": { + "module": "commonjs", + "declaration": false, + "removeComments": false, + "emitDecoratorMetadata": true, + "experimentalDecorators": true, + "allowSyntheticDefaultImports": true, + "target": "ES2022", + "sourceMap": true, + "outDir": "./dist", + "baseUrl": "./", + "incremental": false, + "skipLibCheck": true, + "strict": true, + "moduleResolution": "node", + "esModuleInterop": true, + "types": ["node"] + }, + "include": ["src/**/*.ts"], + "exclude": ["node_modules", 
"dist"] +} diff --git a/docs/control-plane-stage2.md b/docs/control-plane-stage2.md new file mode 100644 index 0000000..3b80cfe --- /dev/null +++ b/docs/control-plane-stage2.md @@ -0,0 +1,73 @@ +--- +title: Stage 2 Control Plane (NATS + NestJS) +summary: Full/lite queue orchestration package installable per profile +--- + +# Stage 2 Control Plane + +This repository now includes a reusable Stage 2 package for queue orchestration and telemetry. + +## Modes + +- `full` (`efra-core`): complete stack + - NATS JetStream + - PostgreSQL state store + - NestJS services: `ingress`, `router`, `broker`, `worker-main`, `worker-research`, `worker-browser-login`, `worker-coolify-ops`, `control-api` + - Observability: Prometheus + Grafana + Uptime Kuma +- `lite` (`andrea`): minimal direct worker path + - NATS JetStream + - PostgreSQL state store + - NestJS services: `ingress`, `router` (forced to `main`), `worker-main`, `broker`, `control-api` + +## Intent Routing + +Ingress receives Telegram/API messages and publishes `tasks.ingress`. +Router classifies intent and emits `tasks.agent.`. +Workers consume per-agent queues and emit `results.agent.`. +Broker persists outputs and can send Telegram replies. + +## Contract + +Task envelope fields: +- `taskId` +- `profile` +- `source.channel/chatId/userId` +- `text` +- `intent` +- `targetAgent` +- `status` + +Result envelope fields: +- `taskId` +- `fromAgent` +- `status` +- `summary` +- `fullResponse` +- `needsConfirmation` + +## Deployment + +Enabled through `playbooks/enterprise.yml` with role `openclaw_control_plane`. 
+ +Inventory variables (`inventories/<env>/group_vars/all.yml`): +- `openclaw_control_plane_enabled` +- `openclaw_control_plane_profiles` + +Secrets (`inventories/<env>/group_vars/vault.yml`): +- `vault_openclaw_cp_postgres_password_*` +- `vault_openclaw_cp_nats_password_*` +- `vault_telegram_bot_token_*` +- `vault_telegram_default_chat_id_*` + +## Operational Endpoints + +- Ingress: `http://127.0.0.1:<ingress_port>/telegram/webhook` +- Control API: `http://127.0.0.1:<control_api_port>/tasks` +- Queue stats: `http://127.0.0.1:<control_api_port>/queues` +- Grafana (`full` only): `http://127.0.0.1:<grafana_port>` +- Prometheus (`full` only): `http://127.0.0.1:<prometheus_port>` + +## Packaging for other profiles + +To install this package on another profile, add one object to `openclaw_control_plane_profiles`. +No code changes are required, only profile variables and secrets. diff --git a/docs/enterprise-deployment.md b/docs/enterprise-deployment.md new file mode 100644 index 0000000..f927b82 --- /dev/null +++ b/docs/enterprise-deployment.md @@ -0,0 +1,138 @@ +--- +title: Enterprise Deployment +summary: Multi-environment, multi-profile OpenClaw deployment with Ansible +--- + +# Enterprise Deployment + +This repository now includes an enterprise deployment path with: + +- Multi-environment inventories: `dev`, `staging`, `prod`, `research` +- Multi-profile gateway services per host +- Multi-agent profile config generation +- Multi-provider, multi-model defaults (OpenAI + Anthropic) +- Secret isolation via per-profile `EnvironmentFile` +- Stage 2 control-plane package (`full`/`lite`) with NATS + NestJS routing + +## Files + +- Playbook: `playbooks/enterprise.yml` +- Role: `roles/openclaw_enterprise` +- Stage 2 role: `roles/openclaw_control_plane` +- Stage 2 services source: `control-plane/` +- Inventories: `inventories/<env>/...` + +## Run + +```bash +ansible-playbook -i inventories/dev/hosts.yml playbooks/enterprise.yml --ask-become-pass +ansible-playbook -i inventories/staging/hosts.yml playbooks/enterprise.yml --ask-become-pass +ansible-playbook -i 
inventories/prod/hosts.yml playbooks/enterprise.yml --ask-become-pass +ansible-playbook -i inventories/research/hosts.yml playbooks/enterprise.yml --ask-become-pass +``` + +Or use the helper script: + +```bash +./run-enterprise-playbook.sh dev +./run-enterprise-playbook.sh staging +./run-enterprise-playbook.sh prod +./run-enterprise-playbook.sh research +``` + +### Resilient rollout behavior + +`playbooks/enterprise.yml` is configured for resilient multi-node rollout: + +- `serial: 1` (one node at a time) +- `ignore_unreachable: true` (continue when one node is down) +- `any_errors_fatal: false` +- `max_fail_percentage: 100` (do not abort the whole batch on partial failure) + +You can override at runtime: + +```bash +./run-enterprise-playbook.sh dev -e openclaw_rollout_serial=2 +./run-enterprise-playbook.sh dev -e openclaw_ignore_unreachable=false +./run-enterprise-playbook.sh dev -e openclaw_max_fail_percentage=50 +``` + +## Secrets + +Store credentials in Ansible Vault and reference them from `inventories/*/group_vars/all.yml`: + +- `vault_openclaw_gateway_token_*` +- `vault_openai_api_key_*` (optional when using OAuth/browser auth) +- `vault_anthropic_api_key_*` (optional when using OAuth/browser auth) + +The role writes `/etc/openclaw/secrets/.env` with mode `0640`, owner `root`, group `openclaw`. 
+ +### Initialize vault files + +Copy example files and encrypt: + +```bash +cp inventories/dev/group_vars/vault.example.yml inventories/dev/group_vars/vault.yml +cp inventories/staging/group_vars/vault.example.yml inventories/staging/group_vars/vault.yml +cp inventories/prod/group_vars/vault.example.yml inventories/prod/group_vars/vault.yml +cp inventories/research/group_vars/vault.example.yml inventories/research/group_vars/vault.yml + +ansible-vault encrypt inventories/dev/group_vars/vault.yml +ansible-vault encrypt inventories/staging/group_vars/vault.yml +ansible-vault encrypt inventories/prod/group_vars/vault.yml +ansible-vault encrypt inventories/research/group_vars/vault.yml +``` + +`vault.yml` files are git-ignored by default. + +## Service model + +Each profile produces: + +- Config file: `<state_dir>/openclaw.json` +- Systemd unit: `openclaw-gateway-<profile>.service` +- Runtime isolation via per-profile `OPENCLAW_PROFILE`, `OPENCLAW_STATE_DIR`, `OPENCLAW_CONFIG_PATH`, `OPENCLAW_GATEWAY_PORT` + +## Android nodes in multi-node topology + +Android is a companion node (`role: node`) that connects to the gateway WebSocket and +must be paired on the gateway side. It does not run the gateway service and should not +be treated as an Ansible SSH target. + +Recommended pattern: + +- Keep Linux gateways in `openclaw_gateway`. +- Keep Android references in `openclaw_mobile_nodes` as inventory metadata. +- Operate pairing/state from a gateway host: + +```bash +openclaw nodes pending +openclaw nodes approve <requestId> +openclaw nodes status +``` + +For tailnet-only connectivity, bind gateway profiles to tailnet interfaces and use +gateway tailnet IP/MagicDNS from Android. 
+ +## Browser login agent (OpenClaw-managed browser) + +Enterprise profiles can include a dedicated `browser-login` agent with: + +- `tools.profile: full` + `tools.allow: ["browser"]` (browser-only surface) +- `sandbox.mode: "off"` for reliable host login flows on strict sites +- profile-level browser default set to `openclaw` + +Operational flow (from gateway host): + +```bash +openclaw browser --browser-profile openclaw start +openclaw browser --browser-profile openclaw open https://x.com +``` + +Then sign in manually in the managed browser profile. Do not share credentials with the model. + +## Notes + +- Existing `playbook.yml` is unchanged for one-command installs. +- Use `playbooks/enterprise.yml` for multi-node production topology. +- Stage 2 queue orchestration and telemetry details: `docs/control-plane-stage2.md`. diff --git a/docs/epics/EPIC-002-control-plane-stage2.md b/docs/epics/EPIC-002-control-plane-stage2.md new file mode 100644 index 0000000..f199f95 --- /dev/null +++ b/docs/epics/EPIC-002-control-plane-stage2.md @@ -0,0 +1,32 @@ +# EPIC-002 - Stage 2 Queue Control Plane + +Status: In Progress +Owner: platform/devops + +## Goal + +Deliver a reusable package that adds queue orchestration, routing-by-intent, and telemetry for OpenClaw multi-agent deployments. + +## Scope + +- NATS JetStream task bus +- NestJS microservices control plane +- `full` and `lite` deployment modes +- Inventory-driven packaging for reuse in other profiles + +## Acceptance Criteria + +1. `efra-core` profile deploys full stack from Ansible. +2. `andrea` profile deploys lite stack from Ansible. +3. Telegram ingress can route and persist tasks with source + agent attribution. +4. Control API exposes task list and queue stats. +5. Full mode exposes Prometheus/Grafana/Uptime Kuma. +6. Documentation includes install, operations, rollback, and secrets map. + +## Evidence + +- `playbooks/enterprise.yml` includes `openclaw_control_plane` role. 
+- `roles/openclaw_control_plane/*` templates and tasks render full/lite stacks. +- `control-plane/src/*` contains ingress/router/worker/broker/control-api. +- `inventories/dev/group_vars/all.yml` defines `efra-core` (full) and `andrea` (lite). +- `docs/control-plane-stage2.md` documents the runbook. diff --git a/docs/evidence/STAGE2-validation.md b/docs/evidence/STAGE2-validation.md new file mode 100644 index 0000000..f663eec --- /dev/null +++ b/docs/evidence/STAGE2-validation.md @@ -0,0 +1,13 @@ +# Stage 2 Validation Notes + +Date: 2026-03-01 + +## Commands + +- `ANSIBLE_ROLES_PATH=roles ansible-playbook -i inventories/dev/hosts.yml playbooks/enterprise.yml --syntax-check` -> PASS +- `npm install` and build for `control-plane` -> pending in this environment due to a network timeout during package fetch. + +## Observations + +- Ansible role wiring is valid and syntax-check passes. +- Node dependency install did not complete in this execution environment (timeout without output), so runtime smoke tests are pending on a host with outbound npm access. diff --git a/docs/stories/STORY-201-full-stack-efra-core.md b/docs/stories/STORY-201-full-stack-efra-core.md new file mode 100644 index 0000000..bdab2a2 --- /dev/null +++ b/docs/stories/STORY-201-full-stack-efra-core.md @@ -0,0 +1,14 @@ +# STORY-201 - Deploy full stack for efra-core + +Status: Done + +## Acceptance Criteria + +- Full compose includes NATS, Postgres, ingress, router, broker, 4 workers, control-api, Prometheus, Grafana, Uptime Kuma. +- Ports bind to loopback only. +- Secrets are read from per-profile variables. 
+ +## Evidence + +- `roles/openclaw_control_plane/templates/docker-compose.full.yml.j2` +- `inventories/dev/group_vars/all.yml` (`openclaw_control_plane_profiles[efra-core]`) diff --git a/docs/stories/STORY-202-lite-stack-andrea.md b/docs/stories/STORY-202-lite-stack-andrea.md new file mode 100644 index 0000000..2cb6d38 --- /dev/null +++ b/docs/stories/STORY-202-lite-stack-andrea.md @@ -0,0 +1,14 @@ +# STORY-202 - Deploy lite stack for andrea + +Status: Done + +## Acceptance Criteria + +- Lite compose includes only minimal services. +- Router forced route to `main` worker. +- Independent profile variables and secrets. + +## Evidence + +- `roles/openclaw_control_plane/templates/docker-compose.lite.yml.j2` +- `inventories/dev/group_vars/all.yml` (`openclaw_control_plane_profiles[andrea]`) diff --git a/docs/stories/STORY-203-intent-routing-and-result-broker.md b/docs/stories/STORY-203-intent-routing-and-result-broker.md new file mode 100644 index 0000000..70b72f7 --- /dev/null +++ b/docs/stories/STORY-203-intent-routing-and-result-broker.md @@ -0,0 +1,16 @@ +# STORY-203 - Intent routing and broker attribution + +Status: Done + +## Acceptance Criteria + +- Router classifies intent and sends to target subject. +- Worker output includes `fromAgent`, `taskId`, and confirmation flag. +- Broker persists task result and sends Telegram reply with attribution. + +## Evidence + +- `control-plane/src/router/router.runner.ts` +- `control-plane/src/worker/worker.runner.ts` +- `control-plane/src/broker/broker.runner.ts` +- `control-plane/src/common/contracts.ts` diff --git a/docs/stories/STORY-204-observability-and-control-api.md b/docs/stories/STORY-204-observability-and-control-api.md new file mode 100644 index 0000000..bd73543 --- /dev/null +++ b/docs/stories/STORY-204-observability-and-control-api.md @@ -0,0 +1,16 @@ +# STORY-204 - Observability and control API + +Status: Done + +## Acceptance Criteria + +- Full mode exports metrics for Prometheus. 
+- Grafana is provisioned with default datasource. +- Control API provides task listing, details, decisions, and queue stats. + +## Evidence + +- `roles/openclaw_control_plane/templates/prometheus.yml.j2` +- `roles/openclaw_control_plane/templates/grafana-datasources.yml.j2` +- `control-plane/src/control-api/control.controller.ts` +- `control-plane/src/control-api/control.service.ts` diff --git a/inventories/README.md b/inventories/README.md new file mode 100644 index 0000000..22c37ce --- /dev/null +++ b/inventories/README.md @@ -0,0 +1,75 @@ +# Inventories + +Each environment has: + +- `hosts.yml` +- `group_vars/all.yml` +- optional `group_vars/vault.yml` (encrypted with `ansible-vault`) +- `group_vars/vault.example.yml` template + +`hosts.yml` can also include metadata-only node groups (for example `openclaw_mobile_nodes`) +to track Android/iOS companions that pair over Gateway WebSocket. Keep deployment targets +inside `openclaw_gateway`; mobile nodes are not SSH-managed by Ansible. + +Resilience knobs (set in `group_vars/all.yml` or `-e`): + +- `openclaw_rollout_serial` (default `1`) +- `openclaw_ignore_unreachable` (default `true`) +- `openclaw_max_fail_percentage` (default `100`) + +Stage 2 control-plane knobs: + +- `openclaw_control_plane_enabled` +- `openclaw_control_plane_manage_stack` +- `openclaw_control_plane_profiles` (`mode: full|lite`) + +Example: + +```bash +ansible-vault create inventories/prod/group_vars/vault.yml +``` + +Or bootstrap from template: + +```bash +cp inventories/prod/group_vars/vault.example.yml inventories/prod/group_vars/vault.yml +ansible-vault encrypt inventories/prod/group_vars/vault.yml +``` + +`vault.yml` should define secrets referenced in `group_vars/all.yml`, for example: + +```yaml +vault_openclaw_gateway_token_prod_main: "..." +vault_openclaw_gateway_token_prod_rescue: "..." +vault_openai_api_key_prod: "..." +vault_anthropic_api_key_prod: "..." 
+``` + +`vault_openai_api_key_*` / `vault_anthropic_api_key_*` can be left empty when using OAuth/browser login flows. + +## Android node flow (metadata + operations) + +1. Keep Android inventory entries under `openclaw_mobile_nodes` (metadata only). + - Suggested metadata: `openclaw_node_tailnet_ip`, `openclaw_node_magicdns`, `openclaw_node_gateway`. + - Discover from control host: `sudo tailscale status --json`. +2. Deploy/upgrade gateways with: + - `ansible-playbook -i inventories/dev/hosts.yml playbooks/enterprise.yml --ask-become-pass` +3. Pair Android from a gateway host: + - `openclaw nodes pending` + - `openclaw nodes approve ` +4. Verify runtime connectivity: + - `openclaw nodes status` + +## Browser login agent flow + +Enterprise profile examples include `browser-login` with browser-only access: + +- `tools.profile: full` +- `tools.allow: ["browser"]` +- `sandbox.mode: "off"` +- `browser.defaultProfile: openclaw` + +Manual login runbook on gateway host: + +- `openclaw browser --browser-profile openclaw start` +- `openclaw browser --browser-profile openclaw open https://x.com` diff --git a/inventories/dev/group_vars/all.yml b/inventories/dev/group_vars/all.yml new file mode 100644 index 0000000..b6f9846 --- /dev/null +++ b/inventories/dev/group_vars/all.yml @@ -0,0 +1,97 @@ +--- +openclaw_install_mode: release +openclaw_enterprise_enabled: true +openclaw_rollout_serial: 1 +openclaw_ignore_unreachable: true +openclaw_max_fail_percentage: 100 +openclaw_enterprise_profiles: + - name: dev-main + gateway_port: 19011 + gateway_bind: loopback + browser: + enabled: true + defaultProfile: openclaw + state_dir: /home/openclaw/.openclaw-dev-main + config_path: /home/openclaw/.openclaw-dev-main/openclaw.json + workspace_root: /home/openclaw/.openclaw-dev-main/workspace + model_primary: openai/gpt-5-mini + model_fallbacks: + - anthropic/claude-sonnet-4-5 + - openai/gpt-5.2 + tools_profile: coding + sandbox_mode: non-main + sandbox_scope: session + agents: + - id: main 
+ default: true + workspace: /home/openclaw/.openclaw-dev-main/workspace + - id: research + workspace: /home/openclaw/.openclaw-dev-main/workspace-research + tools: + profile: coding + - id: browser-login + workspace: /home/openclaw/.openclaw-dev-main/workspace-browser-login + sandbox: + mode: "off" + tools: + profile: full + allow: + - browser + bindings: [] + env: + OPENCLAW_GATEWAY_TOKEN: "{{ vault_openclaw_gateway_token_dev_main | default('') }}" + OPENAI_API_KEY: "{{ vault_openai_api_key_dev | default('') }}" + ANTHROPIC_API_KEY: "{{ vault_anthropic_api_key_dev | default('') }}" + + - name: andrea + gateway_port: 19031 + gateway_bind: loopback + state_dir: /home/openclaw/.openclaw-andrea + config_path: /home/openclaw/.openclaw-andrea/openclaw.json + workspace_root: /home/openclaw/.openclaw-andrea/workspace + model_primary: openai/gpt-5-mini + model_fallbacks: + - anthropic/claude-sonnet-4-5 + tools_profile: messaging + sandbox_mode: non-main + sandbox_scope: session + agents: + - id: main + default: true + workspace: /home/openclaw/.openclaw-andrea/workspace + bindings: [] + env: + OPENCLAW_GATEWAY_TOKEN: "{{ vault_openclaw_gateway_token_andrea | default('') }}" + OPENAI_API_KEY: "{{ vault_openai_api_key_dev | default('') }}" + ANTHROPIC_API_KEY: "{{ vault_anthropic_api_key_dev | default('') }}" + +# Stage 2 control-plane package (NATS + NestJS microservices) +openclaw_control_plane_enabled: true +openclaw_control_plane_manage_stack: true +openclaw_control_plane_profiles: + - name: efra-core + mode: full + gateway_profile: dev-main + project_dir: /opt/openclaw/control-plane/efra-core + ingress_port: 30101 + control_api_port: 39101 + grafana_port: 31001 + prometheus_port: 39091 + uptime_kuma_port: 31081 + telegram_bot_token: "{{ vault_telegram_bot_token_efra_core | default('') }}" + telegram_default_chat_id: "{{ vault_telegram_default_chat_id_efra_core | default('') }}" + postgres_password: "{{ vault_openclaw_cp_postgres_password_efra_core | default('') }}" + 
nats_user: queue + nats_password: "{{ vault_openclaw_cp_nats_password_efra_core | default('') }}" + + - name: andrea + mode: lite + gateway_profile: andrea + project_dir: /opt/openclaw/control-plane/andrea + ingress_port: 30111 + control_api_port: 39111 + telegram_bot_token: "{{ vault_telegram_bot_token_andrea | default('') }}" + telegram_default_chat_id: "{{ vault_telegram_default_chat_id_andrea | default('') }}" + postgres_password: "{{ vault_openclaw_cp_postgres_password_andrea | default('') }}" + nats_user: queue + nats_password: "{{ vault_openclaw_cp_nats_password_andrea | default('') }}" diff --git a/inventories/dev/group_vars/vault.example.yml b/inventories/dev/group_vars/vault.example.yml new file mode 100644 index 0000000..1601d33 --- /dev/null +++ b/inventories/dev/group_vars/vault.example.yml @@ -0,0 +1,16 @@ +--- +vault_openclaw_gateway_token_dev_main: "replace-with-temp-token-dev-main" +vault_openclaw_gateway_token_andrea: "replace-with-temp-token-andrea" +vault_openai_api_key_dev: "replace-with-temp-openai-key-dev" +vault_anthropic_api_key_dev: "replace-with-temp-anthropic-key-dev" + +# Stage 2 control-plane secrets +vault_openclaw_cp_postgres_password_efra_core: "replace-with-strong-password" +vault_openclaw_cp_nats_password_efra_core: "replace-with-strong-password" +vault_telegram_bot_token_efra_core: "replace-with-bot-token" +vault_telegram_default_chat_id_efra_core: "replace-with-chat-id" + +vault_openclaw_cp_postgres_password_andrea: "replace-with-strong-password" +vault_openclaw_cp_nats_password_andrea: "replace-with-strong-password" +vault_telegram_bot_token_andrea: "replace-with-bot-token" +vault_telegram_default_chat_id_andrea: "replace-with-chat-id" diff --git a/inventories/dev/host_vars/.gitkeep b/inventories/dev/host_vars/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/inventories/dev/hosts.yml b/inventories/dev/hosts.yml new file mode 100644 index 0000000..30c6ff1 --- /dev/null +++ b/inventories/dev/hosts.yml @@ -0,0 
+1,29 @@ +--- +all: + vars: + ansible_user: efra + ansible_become: true + ansible_become_method: sudo + children: + openclaw_gateway: + hosts: + zennook: + ansible_host: 100.127.119.39 + ansible_connection: local + ansible_user: efra + fedora: + ansible_host: 100.109.82.18 + ansible_user: clawadmin + openclaw_mobile_nodes: + vars: + # Mobile nodes connect to gateway WS and are paired from gateway CLI. + # They are inventory metadata only (not Ansible SSH targets). + openclaw_node_transport: gateway_ws + openclaw_node_role: node + hosts: + android_efra: + openclaw_node_platform: android + openclaw_node_enabled: true + openclaw_node_gateway: zennook + openclaw_node_tailnet_ip: 100.78.147.33 + openclaw_node_magicdns: poco-f8-ultra.tail486c6b.ts.net diff --git a/inventories/prod/group_vars/all.yml b/inventories/prod/group_vars/all.yml new file mode 100644 index 0000000..09a26b3 --- /dev/null +++ b/inventories/prod/group_vars/all.yml @@ -0,0 +1,82 @@ +--- +openclaw_install_mode: release +openclaw_enterprise_enabled: true +openclaw_enterprise_profiles: + - name: prod-main + gateway_port: 18789 + gateway_bind: loopback + browser: + enabled: true + defaultProfile: openclaw + state_dir: /home/openclaw/.openclaw-prod-main + config_path: /home/openclaw/.openclaw-prod-main/openclaw.json + workspace_root: /home/openclaw/.openclaw-prod-main/workspace + model_primary: anthropic/claude-opus-4-6 + model_fallbacks: + - openai/gpt-5.2 + - anthropic/claude-sonnet-4-5 + - openai/gpt-5-mini + tools_profile: coding + sandbox_mode: non-main + sandbox_scope: session + max_concurrent: 8 + context_tokens: 250000 + agents: + - id: main + default: true + workspace: /home/openclaw/.openclaw-prod-main/workspace + - id: ops + workspace: /home/openclaw/.openclaw-prod-main/workspace-ops + tools: + profile: messaging + - id: research + workspace: /home/openclaw/.openclaw-prod-main/workspace-research + tools: + profile: coding + - id: browser-login + workspace: 
/home/openclaw/.openclaw-prod-main/workspace-browser-login + sandbox: + mode: "off" + tools: + profile: full + allow: + - browser + bindings: [] + env: + OPENCLAW_GATEWAY_TOKEN: "{{ vault_openclaw_gateway_token_prod_main | default('') }}" + OPENAI_API_KEY: "{{ vault_openai_api_key_prod | default('') }}" + ANTHROPIC_API_KEY: "{{ vault_anthropic_api_key_prod | default('') }}" + + - name: prod-rescue + gateway_port: 19789 + gateway_bind: loopback + browser: + enabled: true + defaultProfile: openclaw + state_dir: /home/openclaw/.openclaw-prod-rescue + config_path: /home/openclaw/.openclaw-prod-rescue/openclaw.json + workspace_root: /home/openclaw/.openclaw-prod-rescue/workspace + model_primary: openai/gpt-5-mini + model_fallbacks: + - anthropic/claude-sonnet-4-5 + - openai/gpt-5.2 + tools_profile: messaging + sandbox_mode: all + sandbox_scope: agent + agents: + - id: main + default: true + workspace: /home/openclaw/.openclaw-prod-rescue/workspace + - id: browser-login + workspace: /home/openclaw/.openclaw-prod-rescue/workspace-browser-login + sandbox: + mode: "off" + tools: + profile: full + allow: + - browser + bindings: [] + env: + OPENCLAW_GATEWAY_TOKEN: "{{ vault_openclaw_gateway_token_prod_rescue | default('') }}" + OPENAI_API_KEY: "{{ vault_openai_api_key_prod | default('') }}" + ANTHROPIC_API_KEY: "{{ vault_anthropic_api_key_prod | default('') }}" diff --git a/inventories/prod/group_vars/vault.example.yml b/inventories/prod/group_vars/vault.example.yml new file mode 100644 index 0000000..2acb1fd --- /dev/null +++ b/inventories/prod/group_vars/vault.example.yml @@ -0,0 +1,5 @@ +--- +vault_openclaw_gateway_token_prod_main: "replace-with-temp-token-prod-main" +vault_openclaw_gateway_token_prod_rescue: "replace-with-temp-token-prod-rescue" +vault_openai_api_key_prod: "replace-with-temp-openai-key-prod" +vault_anthropic_api_key_prod: "replace-with-temp-anthropic-key-prod" diff --git a/inventories/prod/host_vars/.gitkeep b/inventories/prod/host_vars/.gitkeep new file 
mode 100644 index 0000000..e69de29 diff --git a/inventories/prod/hosts.yml b/inventories/prod/hosts.yml new file mode 100644 index 0000000..1aaff16 --- /dev/null +++ b/inventories/prod/hosts.yml @@ -0,0 +1,16 @@ +--- +all: + children: + openclaw_gateway: + hosts: + prod-gateway-1: + ansible_host: 10.30.10.11 + ansible_user: ubuntu + prod-gateway-2: + ansible_host: 10.30.10.12 + ansible_user: ubuntu + openclaw_mobile_nodes: + vars: + openclaw_node_transport: gateway_ws + openclaw_node_role: node + hosts: {} diff --git a/inventories/research/group_vars/all.yml b/inventories/research/group_vars/all.yml new file mode 100644 index 0000000..8907e59 --- /dev/null +++ b/inventories/research/group_vars/all.yml @@ -0,0 +1,41 @@ +--- +openclaw_install_mode: development +openclaw_enterprise_enabled: true +openclaw_enterprise_profiles: + - name: research-main + gateway_port: 19211 + gateway_bind: loopback + browser: + enabled: true + defaultProfile: openclaw + state_dir: /home/openclaw/.openclaw-research-main + config_path: /home/openclaw/.openclaw-research-main/openclaw.json + workspace_root: /home/openclaw/.openclaw-research-main/workspace + model_primary: openai/gpt-5.2 + model_fallbacks: + - anthropic/claude-opus-4-6 + - openai/gpt-5-mini + tools_profile: coding + sandbox_mode: all + sandbox_scope: session + agents: + - id: main + default: true + workspace: /home/openclaw/.openclaw-research-main/workspace + - id: experiments + workspace: /home/openclaw/.openclaw-research-main/workspace-experiments + tools: + profile: coding + - id: browser-login + workspace: /home/openclaw/.openclaw-research-main/workspace-browser-login + sandbox: + mode: "off" + tools: + profile: full + allow: + - browser + bindings: [] + env: + OPENCLAW_GATEWAY_TOKEN: "{{ vault_openclaw_gateway_token_research_main | default('') }}" + OPENAI_API_KEY: "{{ vault_openai_api_key_research | default('') }}" + ANTHROPIC_API_KEY: "{{ vault_anthropic_api_key_research | default('') }}" diff --git 
a/inventories/research/group_vars/vault.example.yml b/inventories/research/group_vars/vault.example.yml new file mode 100644 index 0000000..a4ef5d9 --- /dev/null +++ b/inventories/research/group_vars/vault.example.yml @@ -0,0 +1,4 @@ +--- +vault_openclaw_gateway_token_research_main: "replace-with-temp-token-research-main" +vault_openai_api_key_research: "replace-with-temp-openai-key-research" +vault_anthropic_api_key_research: "replace-with-temp-anthropic-key-research" diff --git a/inventories/research/host_vars/.gitkeep b/inventories/research/host_vars/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/inventories/research/hosts.yml b/inventories/research/hosts.yml new file mode 100644 index 0000000..40618f5 --- /dev/null +++ b/inventories/research/hosts.yml @@ -0,0 +1,13 @@ +--- +all: + children: + openclaw_gateway: + hosts: + rsch-gateway-1: + ansible_host: 10.40.10.11 + ansible_user: ubuntu + openclaw_mobile_nodes: + vars: + openclaw_node_transport: gateway_ws + openclaw_node_role: node + hosts: {} diff --git a/inventories/staging/group_vars/all.yml b/inventories/staging/group_vars/all.yml new file mode 100644 index 0000000..4ce74d9 --- /dev/null +++ b/inventories/staging/group_vars/all.yml @@ -0,0 +1,41 @@ +--- +openclaw_install_mode: release +openclaw_enterprise_enabled: true +openclaw_enterprise_profiles: + - name: stg-main + gateway_port: 19111 + gateway_bind: loopback + browser: + enabled: true + defaultProfile: openclaw + state_dir: /home/openclaw/.openclaw-stg-main + config_path: /home/openclaw/.openclaw-stg-main/openclaw.json + workspace_root: /home/openclaw/.openclaw-stg-main/workspace + model_primary: anthropic/claude-sonnet-4-5 + model_fallbacks: + - openai/gpt-5.2 + - openai/gpt-5-mini + tools_profile: coding + sandbox_mode: non-main + sandbox_scope: session + agents: + - id: main + default: true + workspace: /home/openclaw/.openclaw-stg-main/workspace + - id: ops + workspace: /home/openclaw/.openclaw-stg-main/workspace-ops + tools: + 
profile: messaging + - id: browser-login + workspace: /home/openclaw/.openclaw-stg-main/workspace-browser-login + sandbox: + mode: "off" + tools: + profile: full + allow: + - browser + bindings: [] + env: + OPENCLAW_GATEWAY_TOKEN: "{{ vault_openclaw_gateway_token_stg_main | default('') }}" + OPENAI_API_KEY: "{{ vault_openai_api_key_stg | default('') }}" + ANTHROPIC_API_KEY: "{{ vault_anthropic_api_key_stg | default('') }}" diff --git a/inventories/staging/group_vars/vault.example.yml b/inventories/staging/group_vars/vault.example.yml new file mode 100644 index 0000000..17738dd --- /dev/null +++ b/inventories/staging/group_vars/vault.example.yml @@ -0,0 +1,4 @@ +--- +vault_openclaw_gateway_token_stg_main: "replace-with-temp-token-stg-main" +vault_openai_api_key_stg: "replace-with-temp-openai-key-stg" +vault_anthropic_api_key_stg: "replace-with-temp-anthropic-key-stg" diff --git a/inventories/staging/host_vars/.gitkeep b/inventories/staging/host_vars/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/inventories/staging/hosts.yml b/inventories/staging/hosts.yml new file mode 100644 index 0000000..7c74f3e --- /dev/null +++ b/inventories/staging/hosts.yml @@ -0,0 +1,13 @@ +--- +all: + children: + openclaw_gateway: + hosts: + stg-gateway-1: + ansible_host: 10.20.10.11 + ansible_user: ubuntu + openclaw_mobile_nodes: + vars: + openclaw_node_transport: gateway_ws + openclaw_node_role: node + hosts: {} diff --git a/playbooks/enterprise.yml b/playbooks/enterprise.yml new file mode 100644 index 0000000..2da1b33 --- /dev/null +++ b/playbooks/enterprise.yml @@ -0,0 +1,32 @@ +--- +- name: Deploy OpenClaw enterprise topology + hosts: openclaw_gateway + become: true + strategy: linear + serial: "{{ openclaw_rollout_serial | default(1) }}" + ignore_unreachable: "{{ openclaw_ignore_unreachable | default(true) }}" + any_errors_fatal: false + max_fail_percentage: "{{ openclaw_max_fail_percentage | default(100) }}" + + vars: + ansible_python_interpreter: /usr/bin/python3 
+ + pre_tasks: + - name: Report unsupported OS hosts (skipping deployment roles) + ansible.builtin.debug: + msg: >- + Skipping OpenClaw roles on {{ inventory_hostname }} ({{ ansible_distribution }} + {{ ansible_distribution_version }}): supported distributions are Debian/Ubuntu/Fedora. + when: ansible_distribution not in ['Debian', 'Ubuntu', 'Fedora'] + + roles: + - role: openclaw + when: ansible_distribution in ['Debian', 'Ubuntu', 'Fedora'] + - role: openclaw_enterprise + when: + - openclaw_enterprise_enabled | bool + - ansible_distribution in ['Debian', 'Ubuntu', 'Fedora'] + - role: openclaw_control_plane + when: + - openclaw_control_plane_enabled | bool + - ansible_distribution in ['Debian', 'Ubuntu', 'Fedora'] diff --git a/roles/openclaw_control_plane/defaults/main.yml b/roles/openclaw_control_plane/defaults/main.yml new file mode 100644 index 0000000..33bd5c3 --- /dev/null +++ b/roles/openclaw_control_plane/defaults/main.yml @@ -0,0 +1,40 @@ +--- +openclaw_control_plane_enabled: false +openclaw_control_plane_manage_stack: true +openclaw_control_plane_require_secrets: true +openclaw_control_plane_runtime_root: /opt/openclaw/control-plane +openclaw_control_plane_source_dir: "{{ playbook_dir }}/../control-plane" +openclaw_control_plane_project_prefix: ocp +openclaw_control_plane_default_nats_stream: OPENCLAW_TASKS + +# Modes: +# - full: all services (router + broker + workers + observability) +# - lite: direct/simple path (ingress + router-forced-main + worker-main + broker + control-api) +openclaw_control_plane_profiles: [] + +# Example: +# openclaw_control_plane_profiles: +# - name: efra-core +# mode: full +# gateway_profile: dev-main +# project_dir: /opt/openclaw/control-plane/efra-core +# ingress_port: 30101 +# control_api_port: 39101 +# grafana_port: 31001 +# prometheus_port: 39091 +# telegram_bot_token: "{{ vault_telegram_bot_token_efra_core }}" +# telegram_default_chat_id: "{{ vault_telegram_default_chat_id_efra_core }}" +# postgres_password: "{{ 
vault_openclaw_cp_postgres_password_efra_core }}" +# nats_user: queue +# nats_password: "{{ vault_openclaw_cp_nats_password_efra_core }}" +# +# - name: andrea +# mode: lite +# gateway_profile: andrea +# ingress_port: 30111 +# control_api_port: 39111 +# telegram_bot_token: "{{ vault_telegram_bot_token_andrea }}" +# telegram_default_chat_id: "{{ vault_telegram_default_chat_id_andrea }}" +# postgres_password: "{{ vault_openclaw_cp_postgres_password_andrea }}" +# nats_user: queue +# nats_password: "{{ vault_openclaw_cp_nats_password_andrea }}" diff --git a/roles/openclaw_control_plane/tasks/main.yml b/roles/openclaw_control_plane/tasks/main.yml new file mode 100644 index 0000000..bdca0fb --- /dev/null +++ b/roles/openclaw_control_plane/tasks/main.yml @@ -0,0 +1,70 @@ +--- +- name: Validate control-plane profiles list + ansible.builtin.assert: + that: + - openclaw_control_plane_profiles | length > 0 + fail_msg: >- + openclaw_control_plane_enabled=true requires openclaw_control_plane_profiles + with at least one profile. + +- name: Validate control-plane profile schema + ansible.builtin.assert: + that: + - profile.name is defined + - profile.name | length > 0 + - profile.mode is defined + - profile.mode in ['full', 'lite'] + fail_msg: "Each control-plane profile requires non-empty name and mode in [full, lite]." + loop: "{{ openclaw_control_plane_profiles }}" + loop_control: + loop_var: profile + +- name: Validate unique control-plane profile names + ansible.builtin.assert: + that: + - (openclaw_control_plane_profiles | map(attribute='name') | list | unique | length) == + (openclaw_control_plane_profiles | length) + fail_msg: "Control-plane profile names must be unique." 
+ +- name: Validate required secrets in control-plane profile + ansible.builtin.assert: + that: + - profile.postgres_password is defined + - profile.postgres_password | length > 0 + - profile.nats_password is defined + - profile.nats_password | length > 0 + fail_msg: "Profile {{ profile.name }} requires postgres_password and nats_password." + when: openclaw_control_plane_require_secrets | bool + loop: "{{ openclaw_control_plane_profiles }}" + loop_control: + loop_var: profile + +- name: Ensure control-plane runtime root exists + ansible.builtin.file: + path: "{{ openclaw_control_plane_runtime_root }}" + state: directory + owner: "{{ openclaw_user }}" + group: "{{ openclaw_user }}" + mode: '0755' + +- name: Ensure control-plane source directory exists on target + ansible.builtin.file: + path: "{{ openclaw_control_plane_runtime_root }}/source" + state: directory + owner: "{{ openclaw_user }}" + group: "{{ openclaw_user }}" + mode: '0755' + +- name: Sync control-plane source to target + ansible.builtin.copy: + src: "{{ openclaw_control_plane_source_dir }}/" + dest: "{{ openclaw_control_plane_runtime_root }}/source/" + owner: "{{ openclaw_user }}" + group: "{{ openclaw_user }}" + mode: preserve + +- name: Configure and deploy each control-plane profile + ansible.builtin.include_tasks: profile.yml + loop: "{{ openclaw_control_plane_profiles }}" + loop_control: + loop_var: profile diff --git a/roles/openclaw_control_plane/tasks/profile.yml b/roles/openclaw_control_plane/tasks/profile.yml new file mode 100644 index 0000000..51eef68 --- /dev/null +++ b/roles/openclaw_control_plane/tasks/profile.yml @@ -0,0 +1,91 @@ +--- +- name: Set control-plane project dir fact + ansible.builtin.set_fact: + openclaw_control_plane_profile_dir: "{{ profile.project_dir | default(openclaw_control_plane_runtime_root ~ '/' ~ profile.name) }}" + +- name: Ensure profile project root exists + ansible.builtin.file: + path: "{{ openclaw_control_plane_profile_dir }}" + state: directory + owner: "{{ 
openclaw_user }}" + group: "{{ openclaw_user }}" + mode: '0755' + +- name: Ensure profile project directories exist + ansible.builtin.file: + path: "{{ openclaw_control_plane_profile_dir }}/{{ item }}" + state: directory + owner: "{{ openclaw_user }}" + group: "{{ openclaw_user }}" + mode: '0755' + loop: + - data + - prometheus + - grafana/provisioning/datasources + - grafana/dashboards + - loki + +- name: Render control-plane env file + ansible.builtin.template: + src: control-plane.env.j2 + dest: "{{ openclaw_control_plane_profile_dir }}/.env" + owner: root + group: "{{ openclaw_user }}" + mode: '0640' + no_log: true + +- name: Render compose stack for full mode + ansible.builtin.template: + src: docker-compose.full.yml.j2 + dest: "{{ openclaw_control_plane_profile_dir }}/docker-compose.yml" + owner: "{{ openclaw_user }}" + group: "{{ openclaw_user }}" + mode: '0644' + when: profile.mode == 'full' + +- name: Render compose stack for lite mode + ansible.builtin.template: + src: docker-compose.lite.yml.j2 + dest: "{{ openclaw_control_plane_profile_dir }}/docker-compose.yml" + owner: "{{ openclaw_user }}" + group: "{{ openclaw_user }}" + mode: '0644' + when: profile.mode == 'lite' + +- name: Render Prometheus config + ansible.builtin.template: + src: prometheus.yml.j2 + dest: "{{ openclaw_control_plane_profile_dir }}/prometheus/prometheus.yml" + owner: "{{ openclaw_user }}" + group: "{{ openclaw_user }}" + mode: '0644' + +- name: Render Grafana datasource provisioning + ansible.builtin.template: + src: grafana-datasources.yml.j2 + dest: "{{ openclaw_control_plane_profile_dir }}/grafana/provisioning/datasources/datasource.yml" + owner: "{{ openclaw_user }}" + group: "{{ openclaw_user }}" + mode: '0644' + +- name: Deploy control-plane stack with Docker Compose + community.docker.docker_compose_v2: + project_src: "{{ openclaw_control_plane_profile_dir }}" + project_name: "{{ openclaw_control_plane_project_prefix }}-{{ profile.name }}" + files: + - docker-compose.yml + 
state: present + build: always + remove_orphans: true + when: openclaw_control_plane_manage_stack | bool + +- name: Probe control API health + ansible.builtin.uri: + url: "http://127.0.0.1:{{ profile.control_api_port | default(39090) }}/health" + method: GET + status_code: 200 + register: profile_health + retries: 10 + delay: 3 + until: profile_health is succeeded + when: openclaw_control_plane_manage_stack | bool diff --git a/roles/openclaw_control_plane/templates/control-plane.env.j2 b/roles/openclaw_control_plane/templates/control-plane.env.j2 new file mode 100644 index 0000000..2162764 --- /dev/null +++ b/roles/openclaw_control_plane/templates/control-plane.env.j2 @@ -0,0 +1,13 @@ +OPENCLAW_PROFILE={{ profile.gateway_profile | default(profile.name) }} +NATS_URL=nats://{{ profile.nats_user | default('queue') }}:{{ profile.nats_password }}@nats:4222 +NATS_STREAM={{ profile.nats_stream | default(openclaw_control_plane_default_nats_stream) }} +NATS_USER={{ profile.nats_user | default('queue') }} +NATS_PASSWORD={{ profile.nats_password }} +POSTGRES_USER={{ profile.postgres_user | default('openclaw') }} +POSTGRES_PASSWORD={{ profile.postgres_password }} +POSTGRES_DB={{ profile.postgres_db | default('openclaw_control') }} +POSTGRES_URL=postgres://{{ profile.postgres_user | default('openclaw') }}:{{ profile.postgres_password }}@postgres:5432/{{ profile.postgres_db | default('openclaw_control') }} +TELEGRAM_BOT_TOKEN={{ profile.telegram_bot_token | default('') }} +TELEGRAM_DEFAULT_CHAT_ID={{ profile.telegram_default_chat_id | default('') }} +ROUTER_FORCED_AGENT={{ profile.router_forced_agent | default('') }} +GRAFANA_ADMIN_PASSWORD={{ profile.grafana_admin_password | default('openclaw') }} diff --git a/roles/openclaw_control_plane/templates/docker-compose.full.yml.j2 b/roles/openclaw_control_plane/templates/docker-compose.full.yml.j2 new file mode 100644 index 0000000..3e66e7d --- /dev/null +++ b/roles/openclaw_control_plane/templates/docker-compose.full.yml.j2 @@ -0,0 
+1,183 @@ +services: + nats: + image: nats:2.10-alpine + command: ["-js", "-sd", "/data", "-m", "8222", "--user", "${NATS_USER}", "--pass", "${NATS_PASSWORD}"] + restart: unless-stopped + volumes: + - ./data/nats:/data + + postgres: + image: postgres:16-alpine + restart: unless-stopped + environment: + POSTGRES_USER: ${POSTGRES_USER} + POSTGRES_PASSWORD: ${POSTGRES_PASSWORD} + POSTGRES_DB: ${POSTGRES_DB} + volumes: + - ./data/postgres:/var/lib/postgresql/data + + ingress: + build: + context: {{ openclaw_control_plane_runtime_root }}/source + dockerfile: Dockerfile + args: + SERVICE: ingress + env_file: .env + environment: + HTTP_PORT: 3000 + METRICS_PORT: 9401 + restart: unless-stopped + depends_on: + - nats + - postgres + ports: + - "127.0.0.1:{{ profile.ingress_port | default(30101) }}:3000" + + router: + build: + context: {{ openclaw_control_plane_runtime_root }}/source + dockerfile: Dockerfile + args: + SERVICE: router + env_file: .env + environment: + METRICS_PORT: 9402 + ROUTER_FORCED_AGENT: "" + restart: unless-stopped + depends_on: + - nats + - postgres + + broker: + build: + context: {{ openclaw_control_plane_runtime_root }}/source + dockerfile: Dockerfile + args: + SERVICE: broker + env_file: .env + environment: + METRICS_PORT: 9403 + restart: unless-stopped + depends_on: + - nats + - postgres + + worker-main: + build: + context: {{ openclaw_control_plane_runtime_root }}/source + dockerfile: Dockerfile + args: + SERVICE: worker + env_file: .env + environment: + METRICS_PORT: 9411 + WORKER_AGENT_ID: main + restart: unless-stopped + depends_on: + - nats + - postgres + + worker-research: + build: + context: {{ openclaw_control_plane_runtime_root }}/source + dockerfile: Dockerfile + args: + SERVICE: worker + env_file: .env + environment: + METRICS_PORT: 9412 + WORKER_AGENT_ID: research + restart: unless-stopped + depends_on: + - nats + - postgres + + worker-browser-login: + build: + context: {{ openclaw_control_plane_runtime_root }}/source + dockerfile: 
Dockerfile + args: + SERVICE: worker + env_file: .env + environment: + METRICS_PORT: 9413 + WORKER_AGENT_ID: browser-login + restart: unless-stopped + depends_on: + - nats + - postgres + + worker-coolify-ops: + build: + context: {{ openclaw_control_plane_runtime_root }}/source + dockerfile: Dockerfile + args: + SERVICE: worker + env_file: .env + environment: + METRICS_PORT: 9414 + WORKER_AGENT_ID: coolify-ops + restart: unless-stopped + depends_on: + - nats + - postgres + + control-api: + build: + context: {{ openclaw_control_plane_runtime_root }}/source + dockerfile: Dockerfile + args: + SERVICE: control-api + env_file: .env + environment: + HTTP_PORT: 39090 + METRICS_PORT: 9405 + restart: unless-stopped + depends_on: + - nats + - postgres + ports: + - "127.0.0.1:{{ profile.control_api_port | default(39101) }}:39090" + + nats-exporter: + image: natsio/prometheus-nats-exporter:0.17.2 + command: ["-varz", "http://nats:8222"] + restart: unless-stopped + depends_on: + - nats + + prometheus: + image: prom/prometheus:v2.54.1 + command: + - --config.file=/etc/prometheus/prometheus.yml + volumes: + - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro + - ./data/prometheus:/prometheus + restart: unless-stopped + ports: + - "127.0.0.1:{{ profile.prometheus_port | default(39091) }}:9090" + depends_on: + - nats-exporter + - ingress + - control-api + + grafana: + image: grafana/grafana:11.2.2 + environment: + GF_SECURITY_ADMIN_PASSWORD: ${GRAFANA_ADMIN_PASSWORD} + volumes: + - ./data/grafana:/var/lib/grafana + - ./grafana/provisioning/datasources:/etc/grafana/provisioning/datasources:ro + restart: unless-stopped + depends_on: + - prometheus + ports: + - "127.0.0.1:{{ profile.grafana_port | default(31001) }}:3000" + + uptime-kuma: + image: louislam/uptime-kuma:1.23.13 + restart: unless-stopped + volumes: + - ./data/uptime-kuma:/app/data + ports: + - "127.0.0.1:{{ profile.uptime_kuma_port | default(31081) }}:3001" diff --git 
a/roles/openclaw_control_plane/templates/docker-compose.lite.yml.j2 b/roles/openclaw_control_plane/templates/docker-compose.lite.yml.j2 new file mode 100644 index 0000000..9589a15 --- /dev/null +++ b/roles/openclaw_control_plane/templates/docker-compose.lite.yml.j2 @@ -0,0 +1,95 @@ +services: + nats: + image: nats:2.10-alpine + command: ["-js", "-sd", "/data", "-m", "8222", "--user", "${NATS_USER}", "--pass", "${NATS_PASSWORD}"] + restart: unless-stopped + volumes: + - ./data/nats:/data + + postgres: + image: postgres:16-alpine + restart: unless-stopped + environment: + POSTGRES_USER: ${POSTGRES_USER} + POSTGRES_PASSWORD: ${POSTGRES_PASSWORD} + POSTGRES_DB: ${POSTGRES_DB} + volumes: + - ./data/postgres:/var/lib/postgresql/data + + ingress: + build: + context: {{ openclaw_control_plane_runtime_root }}/source + dockerfile: Dockerfile + args: + SERVICE: ingress + env_file: .env + environment: + HTTP_PORT: 3000 + METRICS_PORT: 9401 + restart: unless-stopped + depends_on: + - nats + - postgres + ports: + - "127.0.0.1:{{ profile.ingress_port | default(30111) }}:3000" + + router: + build: + context: {{ openclaw_control_plane_runtime_root }}/source + dockerfile: Dockerfile + args: + SERVICE: router + env_file: .env + environment: + METRICS_PORT: 9402 + ROUTER_FORCED_AGENT: main + restart: unless-stopped + depends_on: + - nats + - postgres + + worker-main: + build: + context: {{ openclaw_control_plane_runtime_root }}/source + dockerfile: Dockerfile + args: + SERVICE: worker + env_file: .env + environment: + METRICS_PORT: 9411 + WORKER_AGENT_ID: main + restart: unless-stopped + depends_on: + - nats + - postgres + + broker: + build: + context: {{ openclaw_control_plane_runtime_root }}/source + dockerfile: Dockerfile + args: + SERVICE: broker + env_file: .env + environment: + METRICS_PORT: 9403 + restart: unless-stopped + depends_on: + - nats + - postgres + + control-api: + build: + context: {{ openclaw_control_plane_runtime_root }}/source + dockerfile: Dockerfile + args: + 
SERVICE: control-api + env_file: .env + environment: + HTTP_PORT: 39090 + METRICS_PORT: 9405 + restart: unless-stopped + depends_on: + - nats + - postgres + ports: + - "127.0.0.1:{{ profile.control_api_port | default(39111) }}:39090" diff --git a/roles/openclaw_control_plane/templates/grafana-datasources.yml.j2 b/roles/openclaw_control_plane/templates/grafana-datasources.yml.j2 new file mode 100644 index 0000000..bb009bb --- /dev/null +++ b/roles/openclaw_control_plane/templates/grafana-datasources.yml.j2 @@ -0,0 +1,9 @@ +apiVersion: 1 + +datasources: + - name: Prometheus + type: prometheus + access: proxy + url: http://prometheus:9090 + isDefault: true + editable: false diff --git a/roles/openclaw_control_plane/templates/prometheus.yml.j2 b/roles/openclaw_control_plane/templates/prometheus.yml.j2 new file mode 100644 index 0000000..d8cd375 --- /dev/null +++ b/roles/openclaw_control_plane/templates/prometheus.yml.j2 @@ -0,0 +1,41 @@ +global: + scrape_interval: 15s + +scrape_configs: + - job_name: nats + static_configs: + - targets: ['nats-exporter:7777'] + + - job_name: ingress + static_configs: + - targets: ['ingress:9401'] + + - job_name: router + static_configs: + - targets: ['router:9402'] + + - job_name: broker + static_configs: + - targets: ['broker:9403'] + + - job_name: worker-main + static_configs: + - targets: ['worker-main:9411'] + + - job_name: control-api + static_configs: + - targets: ['control-api:9405'] + +{% if profile.mode == 'full' %} + - job_name: worker-research + static_configs: + - targets: ['worker-research:9412'] + + - job_name: worker-browser-login + static_configs: + - targets: ['worker-browser-login:9413'] + + - job_name: worker-coolify-ops + static_configs: + - targets: ['worker-coolify-ops:9414'] +{% endif %} diff --git a/roles/openclaw_enterprise/defaults/main.yml b/roles/openclaw_enterprise/defaults/main.yml new file mode 100644 index 0000000..838a692 --- /dev/null +++ b/roles/openclaw_enterprise/defaults/main.yml @@ -0,0 +1,47 @@ 
+--- +openclaw_enterprise_enabled: false +openclaw_enterprise_manage_services: true +openclaw_enterprise_require_secrets: true +openclaw_enterprise_require_provider_api_keys: false +openclaw_enterprise_secret_dir: /etc/openclaw/secrets +openclaw_enterprise_openclaw_bin: "{{ openclaw_home }}/.local/bin/openclaw" + +# Example profile object fields: +# - name: prod-main +# gateway_port: 18789 +# gateway_bind: loopback +# state_dir: /home/openclaw/.openclaw-prod-main +# config_path: /home/openclaw/.openclaw-prod-main/openclaw.json +# workspace_root: /home/openclaw/.openclaw-prod-main/workspace +# env: +# OPENCLAW_GATEWAY_TOKEN: "{{ vault_openclaw_gateway_token_prod_main }}" +# OPENAI_API_KEY: "{{ vault_openai_api_key_prod }}" +# ANTHROPIC_API_KEY: "{{ vault_anthropic_api_key_prod }}" +# agents: +# - id: main +# default: true +# workspace: /home/openclaw/.openclaw-prod-main/workspace +# - id: ops +# workspace: /home/openclaw/.openclaw-prod-main/workspace-ops +# tools: +# profile: messaging +# bindings: [] +openclaw_enterprise_profiles: [] + +openclaw_enterprise_default_models: + "openai/gpt-5.2": + alias: openai-premium + params: + temperature: 0.2 + "openai/gpt-5-mini": + alias: openai-fast + params: + temperature: 0.1 + "anthropic/claude-opus-4-6": + alias: anthropic-premium + params: + temperature: 0.2 + "anthropic/claude-sonnet-4-5": + alias: anthropic-fast + params: + temperature: 0.1 diff --git a/roles/openclaw_enterprise/handlers/main.yml b/roles/openclaw_enterprise/handlers/main.yml new file mode 100644 index 0000000..17e4edc --- /dev/null +++ b/roles/openclaw_enterprise/handlers/main.yml @@ -0,0 +1,14 @@ +--- +- name: Reload systemd daemon + ansible.builtin.systemd: + daemon_reload: true + +- name: Restart enterprise gateways + ansible.builtin.systemd: + name: "openclaw-gateway-{{ profile.name }}" + state: restarted + enabled: true + loop: "{{ openclaw_enterprise_profiles }}" + loop_control: + loop_var: profile + when: openclaw_enterprise_manage_services | bool 
diff --git a/roles/openclaw_enterprise/tasks/main.yml b/roles/openclaw_enterprise/tasks/main.yml new file mode 100644 index 0000000..a30aa3c --- /dev/null +++ b/roles/openclaw_enterprise/tasks/main.yml @@ -0,0 +1,152 @@ +--- +- name: Validate enterprise profile list is present + ansible.builtin.assert: + that: + - openclaw_enterprise_profiles | length > 0 + fail_msg: "openclaw_enterprise_enabled=true requires openclaw_enterprise_profiles with at least one profile." + +- name: Validate required profile keys + ansible.builtin.assert: + that: + - profile.name is defined + - profile.name | length > 0 + - profile.gateway_port is defined + - profile.gateway_port | int > 0 + fail_msg: "Each profile must define non-empty name and positive gateway_port." + loop: "{{ openclaw_enterprise_profiles }}" + loop_control: + loop_var: profile + +- name: Validate unique enterprise profile names + ansible.builtin.assert: + that: + - (openclaw_enterprise_profiles | map(attribute='name') | list | unique | length) == + (openclaw_enterprise_profiles | length) + fail_msg: "Each profile name must be unique." + +- name: Validate unique enterprise gateway ports + ansible.builtin.assert: + that: + - (openclaw_enterprise_profiles | map(attribute='gateway_port') | list | unique | length) == + (openclaw_enterprise_profiles | length) + fail_msg: "Each profile gateway_port must be unique." + +- name: Ensure secrets directory exists + ansible.builtin.file: + path: "{{ openclaw_enterprise_secret_dir }}" + state: directory + owner: root + group: "{{ openclaw_user }}" + mode: '0750' + +- name: Check OpenClaw binary exists for enterprise services + ansible.builtin.stat: + path: "{{ openclaw_enterprise_openclaw_bin }}" + register: openclaw_enterprise_bin + when: openclaw_enterprise_manage_services | bool + +- name: Fail when OpenClaw binary is missing + ansible.builtin.fail: + msg: >- + OpenClaw binary not found at {{ openclaw_enterprise_openclaw_bin }}. 
+ Ensure role 'openclaw' completed successfully before enabling enterprise services. + when: + - openclaw_enterprise_manage_services | bool + - not openclaw_enterprise_bin.stat.exists + +- name: Ensure profile state directory exists + ansible.builtin.file: + path: "{{ profile.state_dir | default(openclaw_home ~ '/.openclaw-' ~ profile.name) }}" + state: directory + owner: "{{ openclaw_user }}" + group: "{{ openclaw_user }}" + mode: '0755' + loop: "{{ openclaw_enterprise_profiles }}" + loop_control: + loop_var: profile + +- name: Ensure profile workspace directory exists + ansible.builtin.file: + path: >- + {{ + profile.workspace_root + | default((profile.state_dir | default(openclaw_home ~ '/.openclaw-' ~ profile.name)) ~ '/workspace') + }} + state: directory + owner: "{{ openclaw_user }}" + group: "{{ openclaw_user }}" + mode: '0755' + loop: "{{ openclaw_enterprise_profiles }}" + loop_control: + loop_var: profile + +- name: Validate required secret keys in profile env + ansible.builtin.assert: + that: + - profile.env is defined + - profile.env.OPENCLAW_GATEWAY_TOKEN is defined + - profile.env.OPENCLAW_GATEWAY_TOKEN | length > 0 + - >- + (not (openclaw_enterprise_require_provider_api_keys | bool)) or + (profile.env.OPENAI_API_KEY is defined and profile.env.OPENAI_API_KEY | length > 0) + - >- + (not (openclaw_enterprise_require_provider_api_keys | bool)) or + (profile.env.ANTHROPIC_API_KEY is defined and profile.env.ANTHROPIC_API_KEY | length > 0) + fail_msg: >- + Profile {{ profile.name }} must define env.OPENCLAW_GATEWAY_TOKEN. + OPENAI/ANTHROPIC keys are required only when + openclaw_enterprise_require_provider_api_keys=true. 
+ when: openclaw_enterprise_require_secrets | bool + loop: "{{ openclaw_enterprise_profiles }}" + loop_control: + loop_var: profile + +- name: Render enterprise profile config + ansible.builtin.template: + src: openclaw-profile.json.j2 + dest: "{{ profile.config_path | default((profile.state_dir | default(openclaw_home ~ '/.openclaw-' ~ profile.name)) ~ '/openclaw.json') }}" + owner: "{{ openclaw_user }}" + group: "{{ openclaw_user }}" + mode: '0640' + loop: "{{ openclaw_enterprise_profiles }}" + loop_control: + loop_var: profile + notify: Restart enterprise gateways + +- name: Render per-profile secret environment file + ansible.builtin.template: + src: profile.env.j2 + dest: "{{ openclaw_enterprise_secret_dir }}/{{ profile.name }}.env" + owner: root + group: "{{ openclaw_user }}" + mode: '0640' + loop: "{{ openclaw_enterprise_profiles }}" + loop_control: + loop_var: profile + notify: Restart enterprise gateways + no_log: true + +- name: Render per-profile systemd unit + ansible.builtin.template: + src: openclaw-gateway-profile.service.j2 + dest: "/etc/systemd/system/openclaw-gateway-{{ profile.name }}.service" + owner: root + group: root + mode: '0644' + loop: "{{ openclaw_enterprise_profiles }}" + loop_control: + loop_var: profile + when: openclaw_enterprise_manage_services | bool + notify: + - Reload systemd daemon + - Restart enterprise gateways + +- name: Ensure enterprise gateway services are enabled and started + ansible.builtin.systemd: + name: "openclaw-gateway-{{ profile.name }}" + enabled: true + state: started + loop: "{{ openclaw_enterprise_profiles }}" + loop_control: + loop_var: profile + when: openclaw_enterprise_manage_services | bool diff --git a/roles/openclaw_enterprise/templates/openclaw-gateway-profile.service.j2 b/roles/openclaw_enterprise/templates/openclaw-gateway-profile.service.j2 new file mode 100644 index 0000000..281493f --- /dev/null +++ b/roles/openclaw_enterprise/templates/openclaw-gateway-profile.service.j2 @@ -0,0 +1,34 @@ 
+[Unit] +Description=OpenClaw Gateway ({{ profile.name }}) +After=network-online.target docker.service +Wants=network-online.target +Requires=docker.service + +[Service] +Type=simple +User={{ openclaw_user }} +Group={{ openclaw_user }} +WorkingDirectory={{ openclaw_home }} + +EnvironmentFile={{ openclaw_enterprise_secret_dir }}/{{ profile.name }}.env +Environment="PNPM_HOME={{ openclaw_home }}/.local/share/pnpm" +Environment="PATH={{ openclaw_home }}/.local/bin:{{ openclaw_home }}/.local/share/pnpm:/usr/local/bin:/usr/bin:/bin" +Environment="HOME={{ openclaw_home }}" +Environment="XDG_RUNTIME_DIR=/run/user/{{ openclaw_uid_value | default('1000') }}" + +ExecStart=/bin/sh {{ openclaw_enterprise_openclaw_bin }} --profile {{ profile.name }} gateway --bind {{ profile.gateway_bind | default('loopback') }} --port {{ profile.gateway_port }} +Restart=always +RestartSec=5 +NoNewPrivileges=true +PrivateTmp=true +ProtectSystem=strict +ProtectHome=false +ReadWritePaths={{ profile.state_dir | default(openclaw_home ~ '/.openclaw-' ~ profile.name) }} +ReadWritePaths={{ openclaw_home }}/.local + +StandardOutput=journal +StandardError=journal +SyslogIdentifier=openclaw-{{ profile.name }} + +[Install] +WantedBy=multi-user.target diff --git a/roles/openclaw_enterprise/templates/openclaw-profile.json.j2 b/roles/openclaw_enterprise/templates/openclaw-profile.json.j2 new file mode 100644 index 0000000..58ec8d3 --- /dev/null +++ b/roles/openclaw_enterprise/templates/openclaw-profile.json.j2 @@ -0,0 +1,58 @@ +{ + "gateway": { + "mode": "local", + "port": {{ profile.gateway_port }}, + "bind": "{{ profile.gateway_bind | default('loopback') }}", + "auth": { + "mode": "token", + "token": "${OPENCLAW_GATEWAY_TOKEN}" + } + }, + "session": { + "dmScope": "{{ profile.dm_scope | default('per-account-channel-peer') }}" + }, + "agents": { + "defaults": { + "workspace": "{{ profile.workspace_root | default((profile.state_dir | default(openclaw_home ~ '/.openclaw-' ~ profile.name)) ~ '/workspace') }}", 
+ "maxConcurrent": {{ profile.max_concurrent | default(4) }}, + "contextTokens": {{ profile.context_tokens | default(200000) }}, + "models": {{ (profile.model_catalog | default(openclaw_enterprise_default_models)) | to_nice_json(indent=6) }}, + "model": { + "primary": "{{ profile.model_primary | default('anthropic/claude-sonnet-4-5') }}", + "fallbacks": {{ profile.model_fallbacks | default(['openai/gpt-5.2', 'openai/gpt-5-mini']) | to_json }} + }, + "sandbox": { + "mode": "{{ profile.sandbox_mode | default('non-main') }}", + "scope": "{{ profile.sandbox_scope | default('session') }}" + } + }, + "list": {{ profile.agents | default([]) | to_nice_json(indent=4) }} + }, + "tools": { + "profile": "{{ profile.tools_profile | default('coding') }}" + }, + "bindings": {{ profile.bindings | default([]) | to_nice_json(indent=2) }}, +{% if profile.browser is defined %} + "browser": {{ profile.browser | to_nice_json(indent=2) }}, +{% endif %} + "auth": { + "profiles": {{ profile.auth_profiles | default({ + 'openai:primary': {'provider': 'openai', 'mode': 'api_key'}, + 'openai:secondary': {'provider': 'openai', 'mode': 'api_key'}, + 'anthropic:primary': {'provider': 'anthropic', 'mode': 'api_key'}, + 'anthropic:secondary': {'provider': 'anthropic', 'mode': 'api_key'} + }) | to_nice_json(indent=4) }}, + "order": {{ profile.auth_order | default({ + 'openai': ['openai:primary', 'openai:secondary'], + 'anthropic': ['anthropic:primary', 'anthropic:secondary'] + }) | to_nice_json(indent=4) }} + }, + "logging": { + "level": "{{ profile.logging_level | default('info') }}", + "consoleLevel": "{{ profile.console_level | default('info') }}", + "redactSensitive": "tools" + }, + "diagnostics": { + "enabled": {{ profile.diagnostics_enabled | default(true) | to_json }} + } +} diff --git a/roles/openclaw_enterprise/templates/profile.env.j2 b/roles/openclaw_enterprise/templates/profile.env.j2 new file mode 100644 index 0000000..12da597 --- /dev/null +++ 
b/roles/openclaw_enterprise/templates/profile.env.j2 @@ -0,0 +1,10 @@ +OPENCLAW_PROFILE={{ profile.name }} +OPENCLAW_STATE_DIR={{ profile.state_dir | default(openclaw_home ~ '/.openclaw-' ~ profile.name) }} +OPENCLAW_CONFIG_PATH={{ profile.config_path | default((profile.state_dir | default(openclaw_home ~ '/.openclaw-' ~ profile.name)) ~ '/openclaw.json') }} +OPENCLAW_GATEWAY_PORT={{ profile.gateway_port }} +OPENCLAW_GATEWAY_TOKEN={{ profile.env.OPENCLAW_GATEWAY_TOKEN | default('') }} +OPENAI_API_KEY={{ profile.env.OPENAI_API_KEY | default('') }} +ANTHROPIC_API_KEY={{ profile.env.ANTHROPIC_API_KEY | default('') }} +{% for k, v in (profile.env_extra | default({})).items() %} +{{ k }}={{ v }} +{% endfor %} diff --git a/run-enterprise-playbook.sh b/run-enterprise-playbook.sh new file mode 100755 index 0000000..d096b0f --- /dev/null +++ b/run-enterprise-playbook.sh @@ -0,0 +1,23 @@ +#!/usr/bin/env bash +set -euo pipefail + +ENVIRONMENT=${1:-dev} +INVENTORY="inventories/${ENVIRONMENT}/hosts.yml" +PLAYBOOK="playbooks/enterprise.yml" +VAULT_VARS_FILE="inventories/${ENVIRONMENT}/group_vars/vault.yml" + +if [[ ! 
-f "$INVENTORY" ]]; then + echo "Inventory not found: $INVENTORY" >&2 + echo "Usage: $0 [environment] [ansible extra args...]" >&2 + exit 1 +fi + +shift || true + +EXTRA_ARGS=("$@") # may be empty; expanded below with the ${arr[@]+...} guard so set -u on bash <4.4 does not abort + +if [[ -f "$VAULT_VARS_FILE" ]]; then + EXTRA_ARGS+=("-e" "@${VAULT_VARS_FILE}") +fi + +ansible-playbook -i "$INVENTORY" "$PLAYBOOK" --ask-become-pass ${EXTRA_ARGS[@]+"${EXTRA_ARGS[@]}"} From 6239e8450d409f465c6993dda7e7622ab913afba Mon Sep 17 00:00:00 2001 From: Efrain Garay Date: Sun, 1 Mar 2026 12:15:37 -0300 Subject: [PATCH 02/16] fix(control-plane): make deploy stable on dev host and pass smoke checks --- control-plane/Dockerfile | 1 + control-plane/package.json | 1 + control-plane/src/common/nats.ts | 46 ++++++++++++++++--- inventories/dev/group_vars/all.yml | 12 ++--- playbooks/control-plane-only.yml | 9 ++++ .../openclaw_control_plane/defaults/main.yml | 4 ++ roles/openclaw_control_plane/tasks/main.yml | 12 ++--- .../openclaw_control_plane/tasks/profile.yml | 28 +++++------ .../templates/docker-compose.full.yml.j2 | 3 ++ 9 files changed, 84 insertions(+), 32 deletions(-) create mode 100644 playbooks/control-plane-only.yml diff --git a/control-plane/Dockerfile b/control-plane/Dockerfile index e40cbb9..e53d274 100644 --- a/control-plane/Dockerfile +++ b/control-plane/Dockerfile @@ -6,6 +6,7 @@ RUN npm install --omit=optional FROM node:22-bookworm-slim AS build WORKDIR /app COPY --from=deps /app/node_modules ./node_modules +COPY package.json ./ COPY tsconfig.json tsconfig.build.json ./ COPY src ./src RUN npm run build diff --git a/control-plane/package.json b/control-plane/package.json index 6d6a671..278c453 100644 --- a/control-plane/package.json +++ b/control-plane/package.json @@ -27,6 +27,7 @@ }, "devDependencies": { "@types/node": "^22.13.8", + "@types/pg": "^8.11.10", "typescript": "^5.7.3" } } diff --git a/control-plane/src/common/nats.ts b/control-plane/src/common/nats.ts index 4176d90..3d6bb19 100644 --- a/control-plane/src/common/nats.ts +++ b/control-plane/src/common/nats.ts @@ -1,9 +1,43 @@ -import
{ StringCodec, connect, type Consumer, type NatsConnection } from 'nats'; +import { + AckPolicy, + type ConnectionOptions, + DeliverPolicy, + ReplayPolicy, + RetentionPolicy, + StringCodec, + connect, + type Consumer, + type NatsConnection +} from 'nats'; const sc = StringCodec(); export async function connectNats(servers: string): Promise { - return connect({ servers: servers.split(',').map((v) => v.trim()) }); + const options: ConnectionOptions = { + servers: [] + }; + + options.servers = servers + .split(',') + .map((v) => v.trim()) + .filter((v) => v.length > 0) + .map((raw) => { + const url = new URL(raw.includes('://') ? raw : `nats://${raw}`); + + if (url.username && options.user === undefined) { + options.user = decodeURIComponent(url.username); + } + if (url.password && options.pass === undefined) { + options.pass = decodeURIComponent(url.password); + } + + url.username = ''; + url.password = ''; + + return `${url.protocol}//${url.host}`; + }); + + return connect(options); } export async function ensureStream(nc: NatsConnection, streamName: string): Promise { @@ -15,7 +49,7 @@ export async function ensureStream(nc: NatsConnection, streamName: string): Prom await jsm.streams.add({ name: streamName, subjects: ['tasks.>', 'results.>', 'control.>'], - retention: 'limits', + retention: RetentionPolicy.Limits, max_age: 7 * 24 * 60 * 60 * 1_000_000_000 }); } @@ -34,11 +68,11 @@ export async function ensureConsumer( } catch { await jsm.consumers.add(streamName, { durable_name: durableName, - ack_policy: 'explicit', - deliver_policy: 'all', + ack_policy: AckPolicy.Explicit, + deliver_policy: DeliverPolicy.All, filter_subject: filterSubject, max_ack_pending: 200, - replay_policy: 'instant' + replay_policy: ReplayPolicy.Instant }); } diff --git a/inventories/dev/group_vars/all.yml b/inventories/dev/group_vars/all.yml index b6f9846..0d75e35 100644 --- a/inventories/dev/group_vars/all.yml +++ b/inventories/dev/group_vars/all.yml @@ -72,7 +72,7 @@ 
openclaw_control_plane_profiles: - name: efra-core mode: full gateway_profile: dev-main - project_dir: /opt/openclaw/control-plane/efra-core + project_dir: /home/efra/openclaw-control-plane/efra-core ingress_port: 30101 control_api_port: 39101 grafana_port: 31001 @@ -80,18 +80,18 @@ openclaw_control_plane_profiles: uptime_kuma_port: 31081 telegram_bot_token: "{{ vault_telegram_bot_token_efra_core | default('') }}" telegram_default_chat_id: "{{ vault_telegram_default_chat_id_efra_core | default('') }}" - postgres_password: "{{ vault_openclaw_cp_postgres_password_efra_core | default('') }}" + postgres_password: "{{ vault_openclaw_cp_postgres_password_efra_core | default('efra-core-postgres-local') }}" nats_user: queue - nats_password: "{{ vault_openclaw_cp_nats_password_efra_core | default('') }}" + nats_password: "{{ vault_openclaw_cp_nats_password_efra_core | default('efra-core-nats-local') }}" - name: andrea mode: lite gateway_profile: andrea - project_dir: /opt/openclaw/control-plane/andrea + project_dir: /home/efra/openclaw-control-plane/andrea ingress_port: 30111 control_api_port: 39111 telegram_bot_token: "{{ vault_telegram_bot_token_andrea | default('') }}" telegram_default_chat_id: "{{ vault_telegram_default_chat_id_andrea | default('') }}" - postgres_password: "{{ vault_openclaw_cp_postgres_password_andrea | default('') }}" + postgres_password: "{{ vault_openclaw_cp_postgres_password_andrea | default('andrea-postgres-local') }}" nats_user: queue - nats_password: "{{ vault_openclaw_cp_nats_password_andrea | default('') }}" + nats_password: "{{ vault_openclaw_cp_nats_password_andrea | default('andrea-nats-local') }}" diff --git a/playbooks/control-plane-only.yml b/playbooks/control-plane-only.yml new file mode 100644 index 0000000..5d5a642 --- /dev/null +++ b/playbooks/control-plane-only.yml @@ -0,0 +1,9 @@ +--- +- name: Deploy only OpenClaw control-plane stacks + hosts: openclaw_gateway + become: false + vars: + ansible_python_interpreter: /usr/bin/python3 + 
roles: + - role: openclaw_control_plane + when: openclaw_control_plane_enabled | bool diff --git a/roles/openclaw_control_plane/defaults/main.yml b/roles/openclaw_control_plane/defaults/main.yml index 33bd5c3..c40e780 100644 --- a/roles/openclaw_control_plane/defaults/main.yml +++ b/roles/openclaw_control_plane/defaults/main.yml @@ -6,6 +6,10 @@ openclaw_control_plane_runtime_root: /opt/openclaw/control-plane openclaw_control_plane_source_dir: "{{ playbook_dir }}/../control-plane" openclaw_control_plane_project_prefix: ocp openclaw_control_plane_default_nats_stream: OPENCLAW_TASKS +openclaw_control_plane_owner: "{{ ansible_user | default('openclaw') }}" +openclaw_control_plane_group: "{{ ansible_user | default('openclaw') }}" +openclaw_control_plane_env_owner: "{{ openclaw_control_plane_owner }}" +openclaw_control_plane_env_group: "{{ openclaw_control_plane_group }}" # Modes: # - full: all services (router + broker + workers + observability) diff --git a/roles/openclaw_control_plane/tasks/main.yml b/roles/openclaw_control_plane/tasks/main.yml index bdca0fb..254a54a 100644 --- a/roles/openclaw_control_plane/tasks/main.yml +++ b/roles/openclaw_control_plane/tasks/main.yml @@ -43,24 +43,24 @@ ansible.builtin.file: path: "{{ openclaw_control_plane_runtime_root }}" state: directory - owner: "{{ openclaw_user }}" - group: "{{ openclaw_user }}" + owner: "{{ openclaw_control_plane_owner }}" + group: "{{ openclaw_control_plane_group }}" mode: '0755' - name: Ensure control-plane source directory exists on target ansible.builtin.file: path: "{{ openclaw_control_plane_runtime_root }}/source" state: directory - owner: "{{ openclaw_user }}" - group: "{{ openclaw_user }}" + owner: "{{ openclaw_control_plane_owner }}" + group: "{{ openclaw_control_plane_group }}" mode: '0755' - name: Sync control-plane source to target ansible.builtin.copy: src: "{{ openclaw_control_plane_source_dir }}/" dest: "{{ openclaw_control_plane_runtime_root }}/source/" - owner: "{{ openclaw_user }}" - 
group: "{{ openclaw_user }}" + owner: "{{ openclaw_control_plane_owner }}" + group: "{{ openclaw_control_plane_group }}" mode: preserve - name: Configure and deploy each control-plane profile diff --git a/roles/openclaw_control_plane/tasks/profile.yml b/roles/openclaw_control_plane/tasks/profile.yml index 51eef68..5f268a5 100644 --- a/roles/openclaw_control_plane/tasks/profile.yml +++ b/roles/openclaw_control_plane/tasks/profile.yml @@ -7,16 +7,16 @@ ansible.builtin.file: path: "{{ openclaw_control_plane_profile_dir }}" state: directory - owner: "{{ openclaw_user }}" - group: "{{ openclaw_user }}" + owner: "{{ openclaw_control_plane_owner }}" + group: "{{ openclaw_control_plane_group }}" mode: '0755' - name: Ensure profile project directories exist ansible.builtin.file: path: "{{ openclaw_control_plane_profile_dir }}/{{ item }}" state: directory - owner: "{{ openclaw_user }}" - group: "{{ openclaw_user }}" + owner: "{{ openclaw_control_plane_owner }}" + group: "{{ openclaw_control_plane_group }}" mode: '0755' loop: - data @@ -29,8 +29,8 @@ ansible.builtin.template: src: control-plane.env.j2 dest: "{{ openclaw_control_plane_profile_dir }}/.env" - owner: root - group: "{{ openclaw_user }}" + owner: "{{ openclaw_control_plane_env_owner }}" + group: "{{ openclaw_control_plane_env_group }}" mode: '0640' no_log: true @@ -38,8 +38,8 @@ ansible.builtin.template: src: docker-compose.full.yml.j2 dest: "{{ openclaw_control_plane_profile_dir }}/docker-compose.yml" - owner: "{{ openclaw_user }}" - group: "{{ openclaw_user }}" + owner: "{{ openclaw_control_plane_owner }}" + group: "{{ openclaw_control_plane_group }}" mode: '0644' when: profile.mode == 'full' @@ -47,8 +47,8 @@ ansible.builtin.template: src: docker-compose.lite.yml.j2 dest: "{{ openclaw_control_plane_profile_dir }}/docker-compose.yml" - owner: "{{ openclaw_user }}" - group: "{{ openclaw_user }}" + owner: "{{ openclaw_control_plane_owner }}" + group: "{{ openclaw_control_plane_group }}" mode: '0644' when: 
profile.mode == 'lite' @@ -56,16 +56,16 @@ ansible.builtin.template: src: prometheus.yml.j2 dest: "{{ openclaw_control_plane_profile_dir }}/prometheus/prometheus.yml" - owner: "{{ openclaw_user }}" - group: "{{ openclaw_user }}" + owner: "{{ openclaw_control_plane_owner }}" + group: "{{ openclaw_control_plane_group }}" mode: '0644' - name: Render Grafana datasource provisioning ansible.builtin.template: src: grafana-datasources.yml.j2 dest: "{{ openclaw_control_plane_profile_dir }}/grafana/provisioning/datasources/datasource.yml" - owner: "{{ openclaw_user }}" - group: "{{ openclaw_user }}" + owner: "{{ openclaw_control_plane_owner }}" + group: "{{ openclaw_control_plane_group }}" mode: '0644' - name: Deploy control-plane stack with Docker Compose diff --git a/roles/openclaw_control_plane/templates/docker-compose.full.yml.j2 b/roles/openclaw_control_plane/templates/docker-compose.full.yml.j2 index 3e66e7d..110d5b8 100644 --- a/roles/openclaw_control_plane/templates/docker-compose.full.yml.j2 +++ b/roles/openclaw_control_plane/templates/docker-compose.full.yml.j2 @@ -148,8 +148,10 @@ services: prometheus: image: prom/prometheus:v2.54.1 + user: "0:0" command: - --config.file=/etc/prometheus/prometheus.yml + - --storage.tsdb.path=/prometheus volumes: - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro - ./data/prometheus:/prometheus @@ -163,6 +165,7 @@ services: grafana: image: grafana/grafana:11.2.2 + user: "0:0" environment: GF_SECURITY_ADMIN_PASSWORD: ${GRAFANA_ADMIN_PASSWORD} volumes: From 558448d30625731ebe8a2efc2d5f6786a773b545 Mon Sep 17 00:00:00 2001 From: Efrain Garay Date: Sun, 1 Mar 2026 12:32:05 -0300 Subject: [PATCH 03/16] feat(ops): add make-driven backup/purge/install/oauth/smoke workflow --- Makefile | 48 +++++++++++++++++++++++++++++++++ README.md | 27 +++++++++++++++++++ docs/operations-workflow.md | 44 ++++++++++++++++++++++++++++++ ops/backup.sh | 52 ++++++++++++++++++++++++++++++++++++ ops/common.sh | 39 +++++++++++++++++++++++++++ 
ops/install.sh | 42 +++++++++++++++++++++++++++++ ops/oauth-login.sh | 19 +++++++++++++ ops/purge.sh | 53 +++++++++++++++++++++++++++++++++++++ ops/smoke.sh | 53 +++++++++++++++++++++++++++++++++++++ 9 files changed, 377 insertions(+) create mode 100644 Makefile create mode 100644 docs/operations-workflow.md create mode 100755 ops/backup.sh create mode 100755 ops/common.sh create mode 100755 ops/install.sh create mode 100755 ops/oauth-login.sh create mode 100755 ops/purge.sh create mode 100755 ops/smoke.sh diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..a220b6a --- /dev/null +++ b/Makefile @@ -0,0 +1,48 @@ +SHELL := /usr/bin/env bash + +.DEFAULT_GOAL := help + +ENV ?= dev +INVENTORY ?= inventories/$(ENV)/hosts.yml +LIMIT ?= zennook +PROFILES ?= dev-main andrea +OAUTH_PROVIDER ?= openai-codex + +.PHONY: help backup purge install oauth-login smoke reinstall + +help: + @echo "OpenClaw Ops Targets" + @echo "" + @echo " make backup Backup current OpenClaw + control-plane state" + @echo " make purge CONFIRM=1 Purge deployed state and containers" + @echo " make install Install/reconcile enterprise + control-plane" + @echo " make oauth-login Run interactive OAuth login per profile" + @echo " make smoke Run post-install smoke checks" + @echo " make reinstall CONFIRM=1 backup + purge + install + smoke" + @echo "" + @echo "Variables:" + @echo " ENV=$(ENV) INVENTORY=$(INVENTORY) LIMIT=$(LIMIT)" + @echo " PROFILES='$(PROFILES)' OAUTH_PROVIDER=$(OAUTH_PROVIDER)" + +backup: + @ENV="$(ENV)" INVENTORY="$(INVENTORY)" LIMIT="$(LIMIT)" ./ops/backup.sh + +purge: + @if [[ "$(CONFIRM)" != "1" ]]; then echo "Use: make purge CONFIRM=1"; exit 1; fi + @ENV="$(ENV)" INVENTORY="$(INVENTORY)" LIMIT="$(LIMIT)" ./ops/purge.sh --yes + +install: + @ENV="$(ENV)" INVENTORY="$(INVENTORY)" LIMIT="$(LIMIT)" ./ops/install.sh + +oauth-login: + @ENV="$(ENV)" INVENTORY="$(INVENTORY)" LIMIT="$(LIMIT)" PROFILES="$(PROFILES)" OAUTH_PROVIDER="$(OAUTH_PROVIDER)" ./ops/oauth-login.sh + 
+smoke: + @ENV="$(ENV)" INVENTORY="$(INVENTORY)" LIMIT="$(LIMIT)" ./ops/smoke.sh + +reinstall: + @if [[ "$(CONFIRM)" != "1" ]]; then echo "Use: make reinstall CONFIRM=1"; exit 1; fi + @$(MAKE) backup ENV="$(ENV)" INVENTORY="$(INVENTORY)" LIMIT="$(LIMIT)" + @$(MAKE) purge CONFIRM=1 ENV="$(ENV)" INVENTORY="$(INVENTORY)" LIMIT="$(LIMIT)" + @$(MAKE) install ENV="$(ENV)" INVENTORY="$(INVENTORY)" LIMIT="$(LIMIT)" + @$(MAKE) smoke ENV="$(ENV)" INVENTORY="$(INVENTORY)" LIMIT="$(LIMIT)" diff --git a/README.md b/README.md index d4bd7ca..a69588d 100644 --- a/README.md +++ b/README.md @@ -154,9 +154,36 @@ ansible-playbook playbook.yml --ask-become-pass - [Technical Details](docs/architecture.md) - Architecture overview - [Enterprise Deployment](docs/enterprise-deployment.md) - Multi-profile deployment - [Stage 2 Control Plane](docs/control-plane-stage2.md) - NATS + NestJS full/lite package +- [Operations Workflow](docs/operations-workflow.md) - Backup/purge/install with Makefile - [Troubleshooting](docs/troubleshooting.md) - Common issues - [Agent Guidelines](AGENTS.md) - AI agent instructions +## Operations Workflow (Makefile) + +For repeatable day-2 operations (backup, clean reinstall, smoke checks), use: + +```bash +cd openclaw-ansible + +# Backup current state +make backup + +# Purge runtime state (requires explicit confirmation) +make purge CONFIRM=1 + +# Reinstall enterprise + stage2 control-plane +make install + +# Interactive OAuth for Codex provider +make oauth-login PROFILES="dev-main andrea" OAUTH_PROVIDER=openai-codex + +# Validate full flow +make smoke + +# One-shot full cycle +make reinstall CONFIRM=1 +``` + ## Requirements - Debian 11+ or Ubuntu 20.04+ or Fedora 40+ diff --git a/docs/operations-workflow.md b/docs/operations-workflow.md new file mode 100644 index 0000000..7a75e2e --- /dev/null +++ b/docs/operations-workflow.md @@ -0,0 +1,44 @@ +--- +title: Operations Workflow (Backup, Purge, Install) +summary: Makefile-driven clean install/uninstall cycle for 
OpenClaw + Stage 2 control-plane +--- + +# Operations Workflow + +This repository provides a Makefile interface over `ops/*.sh` scripts: + +- `make backup` +- `make purge CONFIRM=1` +- `make install` +- `make oauth-login` +- `make smoke` +- `make reinstall CONFIRM=1` + +## Why this split + +- `Makefile`: stable operator commands. +- `ops/*.sh`: implementation details, safe to extend. + +## OAuth note (Codex) + +`openai-codex` login is interactive by design (browser OAuth callback). +It cannot be made fully non-interactive without changing provider auth semantics. + +Use: + +```bash +make oauth-login PROFILES="dev-main andrea" OAUTH_PROVIDER=openai-codex +``` + +## Defaults + +- `ENV=dev` +- `INVENTORY=inventories/dev/hosts.yml` +- `LIMIT=zennook` +- `PROFILES="dev-main andrea"` + +Override per command, for example: + +```bash +make install ENV=staging LIMIT=fedora +``` diff --git a/ops/backup.sh b/ops/backup.sh new file mode 100755 index 0000000..c3522fa --- /dev/null +++ b/ops/backup.sh @@ -0,0 +1,52 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck source=ops/common.sh +source "${SCRIPT_DIR}/common.sh" + +need_cmd tar +need_cmd date + +backup_root="${BACKUP_DIR:-${ROOT_DIR}/backups}" +timestamp="$(date +%Y%m%d-%H%M%S)" +archive_path="${backup_root}/openclaw-backup-${timestamp}.tar.gz" + +mkdir -p "${backup_root}" + +candidates=( + "${ROOT_DIR}/inventories/dev/group_vars/all.yml" + "${ROOT_DIR}/inventories/dev/group_vars/vault.yml" + "/etc/openclaw" + "/opt/openclaw/control-plane" + "/home/efra/openclaw-control-plane" + "/home/openclaw/.openclaw" + "/home/openclaw/.openclaw-dev-main" + "/home/openclaw/.openclaw-andrea" + "/home/efra/.openclaw" + "/home/efra/.openclaw-dev-main" + "/home/efra/.openclaw-andrea" + "/home/openclaw/.config/systemd/user/openclaw-gateway-dev-main.service" + "/home/openclaw/.config/systemd/user/openclaw-gateway-andrea.service" +
"/home/efra/.config/systemd/user/openclaw-gateway-dev-main.service" + "/home/efra/.config/systemd/user/openclaw-gateway-andrea.service" +) + +existing=() +for path in "${candidates[@]}"; do + if run_sudo test -e "${path}"; then + existing+=("${path}") + fi +done + +(( ${#existing[@]} > 0 )) || die "No known OpenClaw paths found to backup." + +log "Creating backup archive: ${archive_path}" +run_sudo tar -czf "${archive_path}" "${existing[@]}"; run_sudo chmod 600 "${archive_path}" # archive bundles /etc/openclaw secrets and vault.yml; keep it owner-only + +if [[ "$(id -u)" -ne 0 ]]; then + run_sudo chown "$(id -u):$(id -g)" "${archive_path}" || true +fi + +log "Backup completed with ${#existing[@]} paths." +log "Archive ready: ${archive_path}" diff --git a/ops/common.sh b/ops/common.sh new file mode 100755 index 0000000..b430ab0 --- /dev/null +++ b/ops/common.sh @@ -0,0 +1,39 @@ +#!/usr/bin/env bash +set -euo pipefail + +ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" + +log() { + printf '[ops] %s\n' "$*" +} + +die() { + printf '[ops] ERROR: %s\n' "$*" >&2 + exit 1 +} + +need_cmd() { + local cmd="$1" + command -v "$cmd" >/dev/null 2>&1 || die "Missing command: $cmd" +} + +run_sudo() { + if [[ "$(id -u)" -eq 0 && "${1:-}" != -* ]]; then # sudo-style flags (e.g. -u USER) still need sudo when already root + "$@" + else + sudo "$@" + fi +} + +resolve_inventory() { + local env_name="${ENV:-dev}" + printf '%s' "${INVENTORY:-${ROOT_DIR}/inventories/${env_name}/hosts.yml}" +} + +resolve_limit() { + printf '%s' "${LIMIT:-zennook}" +} + +resolve_ansible_bin() { + printf '%s' "${ANSIBLE_PLAYBOOK_BIN:-ansible-playbook}" +} diff --git a/ops/install.sh b/ops/install.sh new file mode 100755 index 0000000..fe8da6d --- /dev/null +++ b/ops/install.sh @@ -0,0 +1,42 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck source=ops/common.sh +source "${SCRIPT_DIR}/common.sh" + +ansible_bin="$(resolve_ansible_bin)" +inventory_file="$(resolve_inventory)" +limit_host="$(resolve_limit)" + +need_cmd "${ansible_bin}" + +[[ -f "${inventory_file}" ]] || die "Inventory not found: ${inventory_file}" + +extra_args=()
+if [[ -n "${ANSIBLE_EXTRA_ARGS:-}" ]]; then + # shellcheck disable=SC2206 + extra_args=( ${ANSIBLE_EXTRA_ARGS} ) +fi + +log "Running enterprise install (inventory=${inventory_file}, limit=${limit_host})." +"${ansible_bin}" \ + -i "${inventory_file}" \ + "${ROOT_DIR}/playbooks/enterprise.yml" \ + -l "${limit_host}" \ + --become \ + -e openclaw_control_plane_enabled=true \ + -e openclaw_control_plane_manage_stack=true \ + ${extra_args[@]+"${extra_args[@]}"} # guarded: an empty array counts as unbound under set -u on bash <4.4 + +log "Running control-plane reconciliation playbook." +"${ansible_bin}" \ + -i "${inventory_file}" \ + "${ROOT_DIR}/playbooks/control-plane-only.yml" \ + -l "${limit_host}" \ + --become \ + -e openclaw_control_plane_enabled=true \ + -e openclaw_control_plane_manage_stack=true \ + ${extra_args[@]+"${extra_args[@]}"} # same empty-array guard as the first invocation + +log "Install completed." diff --git a/ops/oauth-login.sh b/ops/oauth-login.sh new file mode 100755 index 0000000..0193756 --- /dev/null +++ b/ops/oauth-login.sh @@ -0,0 +1,19 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck source=ops/common.sh +source "${SCRIPT_DIR}/common.sh" + +provider="${OAUTH_PROVIDER:-openai-codex}" +profiles_raw="${PROFILES:-dev-main andrea}" + +log "Starting interactive OAuth login for provider=${provider} profiles=${profiles_raw}" + +for profile in ${profiles_raw}; do + log "OAuth login for profile=${profile}" + run_sudo -u openclaw -H bash -lc \ + "/home/openclaw/.local/bin/openclaw --profile '${profile}' models auth login --provider '${provider}'" +done + +log "OAuth login flow completed for all profiles." diff --git a/ops/purge.sh b/ops/purge.sh new file mode 100755 index 0000000..9528b10 --- /dev/null +++ b/ops/purge.sh @@ -0,0 +1,53 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck source=ops/common.sh +source "${SCRIPT_DIR}/common.sh" + +need_cmd docker + +if [[ "${1:-}" != "--yes" ]]; then + die "This command is destructive.
Re-run with: ./ops/purge.sh --yes" +fi + +log "Stopping/removing control-plane compose stacks (efra-core, andrea)." +for profile in efra-core andrea; do + compose_file="/home/efra/openclaw-control-plane/${profile}/docker-compose.yml" + project_name="ocp-${profile}" + if run_sudo test -f "${compose_file}"; then + run_sudo docker compose -f "${compose_file}" -p "${project_name}" down --remove-orphans --volumes || true + fi +done + +log "Stopping known OpenClaw gateway services (best effort)." +for user_name in openclaw efra; do + if id "${user_name}" >/dev/null 2>&1; then + run_sudo -u "${user_name}" bash -lc \ + "systemctl --user stop openclaw-gateway-dev-main.service openclaw-gateway-andrea.service >/dev/null 2>&1 || true" + fi +done + +run_sudo pkill -f "openclaw-gateway" || true + +purge_paths=( + "/opt/openclaw/control-plane" + "/home/efra/openclaw-control-plane" + "/home/openclaw/.openclaw" + "/home/openclaw/.openclaw-dev-main" + "/home/openclaw/.openclaw-andrea" + "/home/efra/.openclaw" + "/home/efra/.openclaw-dev-main" + "/home/efra/.openclaw-andrea" +) + +log "Removing OpenClaw runtime/state directories." +for path in "${purge_paths[@]}"; do + if run_sudo test -e "${path}"; then + run_sudo rm -rf "${path}" + log "Removed: ${path}" + fi +done + +log "Purge complete." +log "Note: /etc/openclaw was intentionally preserved (secrets/config bootstrap)." 
diff --git a/ops/smoke.sh b/ops/smoke.sh new file mode 100755 index 0000000..688f082 --- /dev/null +++ b/ops/smoke.sh @@ -0,0 +1,53 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck source=ops/common.sh +source "${SCRIPT_DIR}/common.sh" + +need_cmd curl +need_cmd docker +need_cmd sed +need_cmd grep + +check_url() { + local url="$1" + run_sudo curl -fsS "${url}" >/dev/null + log "Health OK: ${url}" +} + +simulate_and_assert() { + local ingress_port="$1" + local control_port="$2" + local profile_label="$3" + local payload resp task_id tasks_json + + payload=$(cat </dev/null +run_sudo docker compose -f /home/efra/openclaw-control-plane/andrea/docker-compose.yml -p ocp-andrea ps >/dev/null + +check_url "http://127.0.0.1:39101/health" +check_url "http://127.0.0.1:30101/health" +check_url "http://127.0.0.1:39111/health" +check_url "http://127.0.0.1:30111/health" + +simulate_and_assert 30101 39101 "efra-core" +simulate_and_assert 30111 39111 "andrea" + +log "Smoke checks completed successfully." 
From 97bbe9526ab0d18efe1877e9930c421ea6003ebf Mon Sep 17 00:00:00 2001 From: Efrain Garay Date: Sun, 1 Mar 2026 12:44:07 -0300 Subject: [PATCH 04/16] feat(cloudflare): add tunnel role and subdomain exposure workflow --- Makefile | 6 +- README.md | 4 + docs/cloudflare-tunnel.md | 61 +++++++ docs/control-plane-stage2.md | 3 + docs/enterprise-deployment.md | 5 + docs/operations-workflow.md | 7 + ...STORY-205-cloudflare-subdomain-exposure.md | 20 +++ inventories/README.md | 9 ++ inventories/dev/group_vars/all.yml | 24 +++ inventories/dev/group_vars/vault.example.yml | 5 + ops/cloudflare-reconcile.sh | 29 ++++ playbooks/cloudflare-only.yml | 9 ++ playbooks/enterprise.yml | 4 + .../defaults/main.yml | 31 ++++ .../openclaw_cloudflare_tunnel/tasks/main.yml | 149 ++++++++++++++++++ .../templates/cloudflared-config.yml.j2 | 9 ++ .../templates/cloudflared.service.j2 | 21 +++ 17 files changed, 395 insertions(+), 1 deletion(-) create mode 100644 docs/cloudflare-tunnel.md create mode 100644 docs/stories/STORY-205-cloudflare-subdomain-exposure.md create mode 100755 ops/cloudflare-reconcile.sh create mode 100644 playbooks/cloudflare-only.yml create mode 100644 roles/openclaw_cloudflare_tunnel/defaults/main.yml create mode 100644 roles/openclaw_cloudflare_tunnel/tasks/main.yml create mode 100644 roles/openclaw_cloudflare_tunnel/templates/cloudflared-config.yml.j2 create mode 100644 roles/openclaw_cloudflare_tunnel/templates/cloudflared.service.j2 diff --git a/Makefile b/Makefile index a220b6a..0a407ca 100644 --- a/Makefile +++ b/Makefile @@ -8,7 +8,7 @@ LIMIT ?= zennook PROFILES ?= dev-main andrea OAUTH_PROVIDER ?= openai-codex -.PHONY: help backup purge install oauth-login smoke reinstall +.PHONY: help backup purge install cloudflare oauth-login smoke reinstall help: @echo "OpenClaw Ops Targets" @@ -16,6 +16,7 @@ help: @echo " make backup Backup current OpenClaw + control-plane state" @echo " make purge CONFIRM=1 Purge deployed state and containers" @echo " make install 
Install/reconcile enterprise + control-plane" + @echo " make cloudflare Reconcile Cloudflare tunnel/service only" @echo " make oauth-login Run interactive OAuth login per profile" @echo " make smoke Run post-install smoke checks" @echo " make reinstall CONFIRM=1 backup + purge + install + smoke" @@ -34,6 +35,9 @@ purge: install: @ENV="$(ENV)" INVENTORY="$(INVENTORY)" LIMIT="$(LIMIT)" ./ops/install.sh +cloudflare: + @ENV="$(ENV)" INVENTORY="$(INVENTORY)" LIMIT="$(LIMIT)" ./ops/cloudflare-reconcile.sh + oauth-login: @ENV="$(ENV)" INVENTORY="$(INVENTORY)" LIMIT="$(LIMIT)" PROFILES="$(PROFILES)" OAUTH_PROVIDER="$(OAUTH_PROVIDER)" ./ops/oauth-login.sh diff --git a/README.md b/README.md index a69588d..ec93259 100644 --- a/README.md +++ b/README.md @@ -154,6 +154,7 @@ ansible-playbook playbook.yml --ask-become-pass - [Technical Details](docs/architecture.md) - Architecture overview - [Enterprise Deployment](docs/enterprise-deployment.md) - Multi-profile deployment - [Stage 2 Control Plane](docs/control-plane-stage2.md) - NATS + NestJS full/lite package +- [Cloudflare Tunnel Exposure](docs/cloudflare-tunnel.md) - Subdomain publishing for local services - [Operations Workflow](docs/operations-workflow.md) - Backup/purge/install with Makefile - [Troubleshooting](docs/troubleshooting.md) - Common issues - [Agent Guidelines](AGENTS.md) - AI agent instructions @@ -174,6 +175,9 @@ make purge CONFIRM=1 # Reinstall enterprise + stage2 control-plane make install +# Reconcile only Cloudflare tunnel/service (if enabled in inventory) +make cloudflare + # Interactive OAuth for Codex provider make oauth-login PROFILES="dev-main andrea" OAUTH_PROVIDER=openai-codex diff --git a/docs/cloudflare-tunnel.md b/docs/cloudflare-tunnel.md new file mode 100644 index 0000000..51728cc --- /dev/null +++ b/docs/cloudflare-tunnel.md @@ -0,0 +1,61 @@ +--- +title: Cloudflare Tunnel Exposure +summary: Publish OpenClaw and control-plane local services via Cloudflare subdomains +--- + +# Cloudflare Tunnel 
Exposure + +This role ports the same model used in `/home/efra/develop/cloudflare-tunnel` into Ansible so +subdomain exposure can be reconciled together with enterprise deployment. + +## What it manages + +- Installs `cloudflared` (Debian/Ubuntu). +- Writes tunnel credentials and `config.yml`. +- Installs/starts a dedicated systemd service: + - `cloudflared-<tunnel_name>.service` +- Optionally reconciles DNS CNAME records with: + - `cloudflared tunnel route dns <tunnel_name> <hostname>` + +## Inventory variables + +Set in `inventories/<env>/group_vars/all.yml`: + +```yaml +openclaw_cloudflare_tunnel_enabled: true +openclaw_cloudflare_tunnel_name: "zennook-openclaw" +openclaw_cloudflare_tunnel_run_user: efra +openclaw_cloudflare_tunnel_run_group: efra +openclaw_cloudflare_tunnel_id: "{{ vault_cloudflare_tunnel_id }}" +openclaw_cloudflare_tunnel_credentials_json: "{{ vault_cloudflare_tunnel_credentials_json }}" +openclaw_cloudflare_tunnel_manage_dns: false +openclaw_cloudflare_tunnel_ingress: + - hostname: "efra-core-ingress.example.com" + service: "http://127.0.0.1:30101" + - hostname: "efra-core-control.example.com" + service: "http://127.0.0.1:39101" +``` + +Set secrets in `inventories/<env>/group_vars/vault.yml`: + +```yaml +vault_cloudflare_tunnel_id: "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" +vault_cloudflare_tunnel_credentials_json: | + {"AccountTag":"...","TunnelSecret":"...","TunnelID":"...","TunnelName":"..."} +``` + +## Run + +```bash +# Full enterprise reconcile (includes cloudflare role when enabled) +make install ENV=dev LIMIT=zennook + +# Cloudflare-only reconcile +make cloudflare ENV=dev LIMIT=zennook +``` + +## Notes + +- The role assumes the tunnel already exists in Cloudflare. +- `openclaw_cloudflare_tunnel_manage_dns` is off by default to avoid accidental DNS writes. +- If DNS route reconcile is needed, first ensure `cloudflared tunnel login` was performed on the host.
diff --git a/docs/control-plane-stage2.md b/docs/control-plane-stage2.md index 3b80cfe..52e50e4 100644 --- a/docs/control-plane-stage2.md +++ b/docs/control-plane-stage2.md @@ -67,6 +67,9 @@ Secrets (`inventories//group_vars/vault.yml`): - Grafana (`full` only): `http://127.0.0.1:` - Prometheus (`full` only): `http://127.0.0.1:` +You can publish these loopback endpoints through Cloudflare Tunnel subdomains by enabling +`openclaw_cloudflare_tunnel_*` variables in inventory (see `docs/cloudflare-tunnel.md`). + ## Packaging for other profiles To install this package on another profile, add one object to `openclaw_control_plane_profiles`. diff --git a/docs/enterprise-deployment.md b/docs/enterprise-deployment.md index f927b82..ad110d5 100644 --- a/docs/enterprise-deployment.md +++ b/docs/enterprise-deployment.md @@ -13,12 +13,14 @@ This repository now includes an enterprise deployment path with: - Multi-provider, multi-model defaults (OpenAI + Anthropic) - Secret isolation via per-profile `EnvironmentFile` - Stage 2 control-plane package (`full`/`lite`) with NATS + NestJS routing +- Optional Cloudflare Tunnel role for subdomain exposure of loopback services ## Files - Playbook: `playbooks/enterprise.yml` - Role: `roles/openclaw_enterprise` - Stage 2 role: `roles/openclaw_control_plane` +- Cloudflare role: `roles/openclaw_cloudflare_tunnel` - Stage 2 services source: `control-plane/` - Inventories: `inventories//...` @@ -64,6 +66,8 @@ Store credentials in Ansible Vault and reference them from `inventories/*/group_ - `vault_openclaw_gateway_token_*` - `vault_openai_api_key_*` (optional when using OAuth/browser auth) - `vault_anthropic_api_key_*` (optional when using OAuth/browser auth) +- `vault_cloudflare_tunnel_id` (optional when Cloudflare role is enabled) +- `vault_cloudflare_tunnel_credentials_json` (optional when Cloudflare role is enabled) The role writes `/etc/openclaw/secrets/.env` with mode `0640`, owner `root`, group `openclaw`. 
@@ -136,3 +140,4 @@ Then sign in manually in the managed browser profile. Do not share credentials w - Existing `playbook.yml` is unchanged for one-command installs. - Use `playbooks/enterprise.yml` for multi-node production topology. - Stage 2 queue orchestration and telemetry details: `docs/control-plane-stage2.md`. +- Cloudflare exposure details: `docs/cloudflare-tunnel.md`. diff --git a/docs/operations-workflow.md b/docs/operations-workflow.md index 7a75e2e..8655305 100644 --- a/docs/operations-workflow.md +++ b/docs/operations-workflow.md @@ -10,6 +10,7 @@ This repository provides a Makefile interface over `ops/*.sh` scripts: - `make backup` - `make purge CONFIRM=1` - `make install` +- `make cloudflare` - `make oauth-login` - `make smoke` - `make reinstall CONFIRM=1` @@ -30,6 +31,12 @@ Use: make oauth-login PROFILES="dev-main andrea" OAUTH_PROVIDER=openai-codex ``` +Cloudflare reconcile (subdomain exposure only): + +```bash +make cloudflare ENV=dev LIMIT=zennook +``` + ## Defaults - `ENV=dev` diff --git a/docs/stories/STORY-205-cloudflare-subdomain-exposure.md b/docs/stories/STORY-205-cloudflare-subdomain-exposure.md new file mode 100644 index 0000000..a06fbd5 --- /dev/null +++ b/docs/stories/STORY-205-cloudflare-subdomain-exposure.md @@ -0,0 +1,20 @@ +# STORY-205 - Cloudflare subdomain exposure + +Status: Done + +## Acceptance Criteria + +- Cloudflare Tunnel can be managed from Ansible as an optional role. +- Ingress routes map subdomains to loopback services (ingress/control-api/grafana/etc.). +- Operators can run a dedicated Cloudflare reconcile command without re-deploying all stacks. 
+ +## Evidence + +- `roles/openclaw_cloudflare_tunnel/defaults/main.yml` +- `roles/openclaw_cloudflare_tunnel/tasks/main.yml` +- `roles/openclaw_cloudflare_tunnel/templates/cloudflared-config.yml.j2` +- `roles/openclaw_cloudflare_tunnel/templates/cloudflared.service.j2` +- `playbooks/cloudflare-only.yml` +- `ops/cloudflare-reconcile.sh` +- `Makefile` +- `docs/cloudflare-tunnel.md` diff --git a/inventories/README.md b/inventories/README.md index 22c37ce..b4cee74 100644 --- a/inventories/README.md +++ b/inventories/README.md @@ -23,6 +23,15 @@ Stage 2 control-plane knobs: - `openclaw_control_plane_manage_stack` - `openclaw_control_plane_profiles` (`mode: full|lite`) +Cloudflare subdomain exposure knobs: + +- `openclaw_cloudflare_tunnel_enabled` +- `openclaw_cloudflare_tunnel_name` +- `openclaw_cloudflare_tunnel_id` +- `openclaw_cloudflare_tunnel_credentials_json` (from vault) +- `openclaw_cloudflare_tunnel_ingress` (list of `hostname` + `service`) +- `openclaw_cloudflare_tunnel_manage_dns` (optional route reconcile) + Example: ```bash diff --git a/inventories/dev/group_vars/all.yml b/inventories/dev/group_vars/all.yml index 0d75e35..d09089b 100644 --- a/inventories/dev/group_vars/all.yml +++ b/inventories/dev/group_vars/all.yml @@ -95,3 +95,27 @@ openclaw_control_plane_profiles: postgres_password: "{{ vault_openclaw_cp_postgres_password_andrea | default('andrea-postgres-local') }}" nats_user: queue nats_password: "{{ vault_openclaw_cp_nats_password_andrea | default('andrea-nats-local') }}" + +# Optional: expose local loopback services via Cloudflare Tunnel subdomains. +# Keep disabled until tunnel credentials and public domain are configured in vault.yml. 
+openclaw_cloudflare_domain: "example.com" +openclaw_cloudflare_tunnel_enabled: false +openclaw_cloudflare_tunnel_name: "zennook-openclaw" +openclaw_cloudflare_tunnel_run_user: efra +openclaw_cloudflare_tunnel_run_group: efra +openclaw_cloudflare_tunnel_id: "{{ vault_cloudflare_tunnel_id | default('') }}" +openclaw_cloudflare_tunnel_credentials_json: "{{ vault_cloudflare_tunnel_credentials_json | default('') }}" +openclaw_cloudflare_tunnel_manage_dns: false +openclaw_cloudflare_tunnel_ingress: + - hostname: "dev-main-dashboard.{{ openclaw_cloudflare_domain }}" + service: "http://127.0.0.1:19011" + - hostname: "efra-core-ingress.{{ openclaw_cloudflare_domain }}" + service: "http://127.0.0.1:30101" + - hostname: "efra-core-control.{{ openclaw_cloudflare_domain }}" + service: "http://127.0.0.1:39101" + - hostname: "efra-core-grafana.{{ openclaw_cloudflare_domain }}" + service: "http://127.0.0.1:31001" + - hostname: "andrea-ingress.{{ openclaw_cloudflare_domain }}" + service: "http://127.0.0.1:30111" + - hostname: "andrea-control.{{ openclaw_cloudflare_domain }}" + service: "http://127.0.0.1:39111" diff --git a/inventories/dev/group_vars/vault.example.yml b/inventories/dev/group_vars/vault.example.yml index 1601d33..70a6588 100644 --- a/inventories/dev/group_vars/vault.example.yml +++ b/inventories/dev/group_vars/vault.example.yml @@ -14,3 +14,8 @@ vault_openclaw_cp_postgres_password_andrea: "replace-with-strong-password" vault_openclaw_cp_nats_password_andrea: "replace-with-strong-password" vault_telegram_bot_token_andrea: "replace-with-bot-token" vault_telegram_default_chat_id_andrea: "replace-with-chat-id" + +# Cloudflare Tunnel secrets +vault_cloudflare_tunnel_id: "replace-with-tunnel-uuid" +vault_cloudflare_tunnel_credentials_json: | + {"AccountTag":"replace","TunnelSecret":"replace","TunnelID":"replace","TunnelName":"replace"} diff --git a/ops/cloudflare-reconcile.sh b/ops/cloudflare-reconcile.sh new file mode 100755 index 0000000..591a7b0 --- /dev/null +++ 
b/ops/cloudflare-reconcile.sh @@ -0,0 +1,29 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck source=ops/common.sh +source "${SCRIPT_DIR}/common.sh" + +ansible_bin="$(resolve_ansible_bin)" +inventory_file="$(resolve_inventory)" +limit_host="$(resolve_limit)" + +need_cmd "${ansible_bin}" +[[ -f "${inventory_file}" ]] || die "Inventory not found: ${inventory_file}" + +extra_args=() +if [[ -n "${ANSIBLE_EXTRA_ARGS:-}" ]]; then + # shellcheck disable=SC2206 + extra_args=( ${ANSIBLE_EXTRA_ARGS} ) +fi + +log "Reconciling Cloudflare tunnel (inventory=${inventory_file}, limit=${limit_host})." +"${ansible_bin}" \ + -i "${inventory_file}" \ + "${ROOT_DIR}/playbooks/cloudflare-only.yml" \ + -l "${limit_host}" \ + --become \ + ${extra_args[@]+"${extra_args[@]}"} # expands to nothing when the array is empty; avoids "unbound variable" under set -u on bash < 4.4 + +log "Cloudflare reconcile completed." diff --git a/playbooks/cloudflare-only.yml b/playbooks/cloudflare-only.yml new file mode 100644 index 0000000..a26c168 --- /dev/null +++ b/playbooks/cloudflare-only.yml @@ -0,0 +1,9 @@ +--- +- name: Reconcile Cloudflare tunnel exposure + hosts: openclaw_gateway + become: true + vars: + ansible_python_interpreter: /usr/bin/python3 + roles: + - role: openclaw_cloudflare_tunnel + when: openclaw_cloudflare_tunnel_enabled | bool diff --git a/playbooks/enterprise.yml b/playbooks/enterprise.yml index 2da1b33..2e0033e 100644 --- a/playbooks/enterprise.yml +++ b/playbooks/enterprise.yml @@ -30,3 +30,7 @@ when: - openclaw_control_plane_enabled | bool - ansible_distribution in ['Debian', 'Ubuntu', 'Fedora'] + - role: openclaw_cloudflare_tunnel + when: + - openclaw_cloudflare_tunnel_enabled | bool + - ansible_distribution in ['Debian', 'Ubuntu', 'Fedora'] diff --git a/roles/openclaw_cloudflare_tunnel/defaults/main.yml b/roles/openclaw_cloudflare_tunnel/defaults/main.yml new file mode 100644 index 0000000..49dd358 --- /dev/null +++ b/roles/openclaw_cloudflare_tunnel/defaults/main.yml @@ -0,0 +1,31 @@ +---
+openclaw_cloudflare_tunnel_enabled: false +openclaw_cloudflare_tunnel_require_supported_os: true + +openclaw_cloudflare_tunnel_name: "openclaw-{{ inventory_hostname }}" +openclaw_cloudflare_tunnel_service_name: "cloudflared-{{ openclaw_cloudflare_tunnel_name }}" + +openclaw_cloudflare_tunnel_run_user: "{{ ansible_user | default('openclaw') }}" +openclaw_cloudflare_tunnel_run_group: "{{ ansible_user | default('openclaw') }}" + +openclaw_cloudflare_tunnel_workdir: "/etc/openclaw/cloudflare/{{ openclaw_cloudflare_tunnel_name }}" +openclaw_cloudflare_tunnel_config_path: "{{ openclaw_cloudflare_tunnel_workdir }}/config.yml" +openclaw_cloudflare_tunnel_credentials_file: >- + {{ openclaw_cloudflare_tunnel_workdir }}/{{ openclaw_cloudflare_tunnel_id }}.json + +openclaw_cloudflare_tunnel_id: "" +openclaw_cloudflare_tunnel_credentials_json: "" +openclaw_cloudflare_tunnel_metrics_port: 40500 + +# List of public hostnames to expose through this tunnel. +# Example: +# openclaw_cloudflare_tunnel_ingress: +# - hostname: ingress.example.com +# service: http://127.0.0.1:30101 +# - hostname: grafana.example.com +# service: http://127.0.0.1:31001 +openclaw_cloudflare_tunnel_ingress: [] + +# Optional one-time DNS reconcile. +openclaw_cloudflare_tunnel_manage_dns: false +openclaw_cloudflare_tunnel_dns_tunnel_name: "{{ openclaw_cloudflare_tunnel_name }}" diff --git a/roles/openclaw_cloudflare_tunnel/tasks/main.yml b/roles/openclaw_cloudflare_tunnel/tasks/main.yml new file mode 100644 index 0000000..b018d1f --- /dev/null +++ b/roles/openclaw_cloudflare_tunnel/tasks/main.yml @@ -0,0 +1,149 @@ +--- +- name: Validate OS support for Cloudflare Tunnel role + ansible.builtin.assert: + that: + - ansible_distribution in ['Debian', 'Ubuntu'] + fail_msg: >- + openclaw_cloudflare_tunnel currently supports Debian/Ubuntu hosts only. + Set openclaw_cloudflare_tunnel_enabled=false on this host or extend the role for this OS. 
+ when: + - openclaw_cloudflare_tunnel_enabled | bool + - openclaw_cloudflare_tunnel_require_supported_os | bool + +- name: Validate required Cloudflare Tunnel settings + ansible.builtin.assert: + that: + - openclaw_cloudflare_tunnel_name | length > 0 + - openclaw_cloudflare_tunnel_id | length > 0 + - openclaw_cloudflare_tunnel_credentials_json | length > 0 + - openclaw_cloudflare_tunnel_ingress | length > 0 + fail_msg: >- + Set openclaw_cloudflare_tunnel_id, openclaw_cloudflare_tunnel_credentials_json + and at least one openclaw_cloudflare_tunnel_ingress route before enabling the role. + when: openclaw_cloudflare_tunnel_enabled | bool + +- name: Validate Cloudflare ingress route schema + ansible.builtin.assert: + that: + - route.hostname is defined + - route.hostname | length > 0 + - route.service is defined + - route.service | length > 0 + fail_msg: "Each ingress route requires non-empty hostname and service." + loop: "{{ openclaw_cloudflare_tunnel_ingress }}" + loop_control: + loop_var: route + when: openclaw_cloudflare_tunnel_enabled | bool + +- name: Resolve Cloudflare apt codename + ansible.builtin.set_fact: + openclaw_cloudflare_tunnel_apt_codename: >- + {{ 'bookworm' if ansible_distribution_release == 'trixie' else ansible_distribution_release }} + when: + - openclaw_cloudflare_tunnel_enabled | bool + - ansible_distribution in ['Debian', 'Ubuntu'] + +- name: Install Cloudflare Tunnel prerequisites + ansible.builtin.apt: + name: + - ca-certificates + - curl + - lsb-release + state: present + update_cache: true + when: + - openclaw_cloudflare_tunnel_enabled | bool + - ansible_distribution in ['Debian', 'Ubuntu'] + +- name: Install Cloudflare GPG key + ansible.builtin.get_url: + url: https://pkg.cloudflare.com/cloudflare-main.gpg + dest: /usr/share/keyrings/cloudflare-main.gpg + mode: '0644' + when: + - openclaw_cloudflare_tunnel_enabled | bool + - ansible_distribution in ['Debian', 'Ubuntu'] + +- name: Configure Cloudflare apt repository + 
ansible.builtin.apt_repository: + repo: >- + deb [signed-by=/usr/share/keyrings/cloudflare-main.gpg] + https://pkg.cloudflare.com/cloudflared {{ openclaw_cloudflare_tunnel_apt_codename }} main + filename: cloudflared + state: present + when: + - openclaw_cloudflare_tunnel_enabled | bool + - ansible_distribution in ['Debian', 'Ubuntu'] + +- name: Install cloudflared package + ansible.builtin.apt: + name: cloudflared + state: present + update_cache: true + when: + - openclaw_cloudflare_tunnel_enabled | bool + - ansible_distribution in ['Debian', 'Ubuntu'] + +- name: Ensure Cloudflare tunnel working directory exists + ansible.builtin.file: + path: "{{ openclaw_cloudflare_tunnel_workdir }}" + state: directory + owner: "{{ openclaw_cloudflare_tunnel_run_user }}" + group: "{{ openclaw_cloudflare_tunnel_run_group }}" + mode: '0750' + when: openclaw_cloudflare_tunnel_enabled | bool + +- name: Write Cloudflare tunnel credentials file + ansible.builtin.copy: + dest: "{{ openclaw_cloudflare_tunnel_credentials_file }}" + content: "{{ openclaw_cloudflare_tunnel_credentials_json }}\n" + owner: "{{ openclaw_cloudflare_tunnel_run_user }}" + group: "{{ openclaw_cloudflare_tunnel_run_group }}" + mode: '0600' + no_log: true + when: openclaw_cloudflare_tunnel_enabled | bool + +- name: Render Cloudflare tunnel config + ansible.builtin.template: + src: cloudflared-config.yml.j2 + dest: "{{ openclaw_cloudflare_tunnel_config_path }}" + owner: "{{ openclaw_cloudflare_tunnel_run_user }}" + group: "{{ openclaw_cloudflare_tunnel_run_group }}" + mode: '0640' + when: openclaw_cloudflare_tunnel_enabled | bool + +- name: Install Cloudflare tunnel systemd service unit + ansible.builtin.template: + src: cloudflared.service.j2 + dest: "/etc/systemd/system/{{ openclaw_cloudflare_tunnel_service_name }}.service" + owner: root + group: root + mode: '0644' + when: openclaw_cloudflare_tunnel_enabled | bool + +- name: Ensure Cloudflare tunnel service is enabled and running + ansible.builtin.systemd: + 
name: "{{ openclaw_cloudflare_tunnel_service_name }}" + daemon_reload: true + enabled: true + state: started + when: openclaw_cloudflare_tunnel_enabled | bool + +- name: Reconcile Cloudflare DNS routes for ingress hostnames + ansible.builtin.command: + cmd: >- + cloudflared tunnel route dns + {{ openclaw_cloudflare_tunnel_dns_tunnel_name }} + {{ route.hostname }} + register: openclaw_cloudflare_dns_route + changed_when: "'Added CNAME' in (openclaw_cloudflare_dns_route.stdout | default(''))" + failed_when: >- + openclaw_cloudflare_dns_route.rc != 0 and + ('already exists' not in ((openclaw_cloudflare_dns_route.stdout | default('')) | lower)) and + ('already exists' not in ((openclaw_cloudflare_dns_route.stderr | default('')) | lower)) + loop: "{{ openclaw_cloudflare_tunnel_ingress }}" + loop_control: + loop_var: route + when: + - openclaw_cloudflare_tunnel_enabled | bool + - openclaw_cloudflare_tunnel_manage_dns | bool diff --git a/roles/openclaw_cloudflare_tunnel/templates/cloudflared-config.yml.j2 b/roles/openclaw_cloudflare_tunnel/templates/cloudflared-config.yml.j2 new file mode 100644 index 0000000..7595ebc --- /dev/null +++ b/roles/openclaw_cloudflare_tunnel/templates/cloudflared-config.yml.j2 @@ -0,0 +1,9 @@ +tunnel: {{ openclaw_cloudflare_tunnel_id }} +credentials-file: {{ openclaw_cloudflare_tunnel_credentials_file }} +metrics: 127.0.0.1:{{ openclaw_cloudflare_tunnel_metrics_port }} +ingress: +{% for route in openclaw_cloudflare_tunnel_ingress %} + - hostname: {{ route.hostname }} + service: {{ route.service }} +{% endfor %} + - service: http_status:404 diff --git a/roles/openclaw_cloudflare_tunnel/templates/cloudflared.service.j2 b/roles/openclaw_cloudflare_tunnel/templates/cloudflared.service.j2 new file mode 100644 index 0000000..4f9abff --- /dev/null +++ b/roles/openclaw_cloudflare_tunnel/templates/cloudflared.service.j2 @@ -0,0 +1,21 @@ +[Unit] +Description=Cloudflare Tunnel ({{ openclaw_cloudflare_tunnel_name }}) +After=network-online.target 
+Wants=network-online.target + +[Service] +Type=simple +User={{ openclaw_cloudflare_tunnel_run_user }} +Group={{ openclaw_cloudflare_tunnel_run_group }} +WorkingDirectory={{ openclaw_cloudflare_tunnel_workdir }} +ExecStart=/usr/bin/cloudflared tunnel --config {{ openclaw_cloudflare_tunnel_config_path }} run +Restart=always +RestartSec=5 +NoNewPrivileges=true +PrivateTmp=true +ProtectSystem=full +ProtectHome=true +ReadWritePaths={{ openclaw_cloudflare_tunnel_workdir }} + +[Install] +WantedBy=multi-user.target From 50d9bbb71ba45dcf482a218b9a0847a2c666241a Mon Sep 17 00:00:00 2001 From: Efrain Garay Date: Sun, 1 Mar 2026 12:46:52 -0300 Subject: [PATCH 05/16] fix(cloudflare): reuse existing develop tunnel credentials and local1 routes --- docs/cloudflare-tunnel.md | 7 +++ inventories/README.md | 2 + inventories/dev/group_vars/all.yml | 17 ++++--- .../defaults/main.yml | 1 + .../openclaw_cloudflare_tunnel/tasks/main.yml | 51 +++++++++++++++++-- 5 files changed, 67 insertions(+), 11 deletions(-) diff --git a/docs/cloudflare-tunnel.md b/docs/cloudflare-tunnel.md index 51728cc..789cae0 100644 --- a/docs/cloudflare-tunnel.md +++ b/docs/cloudflare-tunnel.md @@ -36,6 +36,13 @@ openclaw_cloudflare_tunnel_ingress: service: "http://127.0.0.1:39101" ``` +If you already have tunnel credentials on host (like `/home/efra/.cloudflared/.json`): + +```yaml +openclaw_cloudflare_tunnel_manage_credentials_file: false +openclaw_cloudflare_tunnel_credentials_file: "/home/efra/.cloudflared/.json" +``` + Set secrets in `inventories//group_vars/vault.yml`: ```yaml diff --git a/inventories/README.md b/inventories/README.md index b4cee74..b454156 100644 --- a/inventories/README.md +++ b/inventories/README.md @@ -29,6 +29,8 @@ Cloudflare subdomain exposure knobs: - `openclaw_cloudflare_tunnel_name` - `openclaw_cloudflare_tunnel_id` - `openclaw_cloudflare_tunnel_credentials_json` (from vault) +- `openclaw_cloudflare_tunnel_manage_credentials_file` (set false to reuse existing credential file) +- 
`openclaw_cloudflare_tunnel_credentials_file` (host path to existing `.json`) - `openclaw_cloudflare_tunnel_ingress` (list of `hostname` + `service`) - `openclaw_cloudflare_tunnel_manage_dns` (optional route reconcile) diff --git a/inventories/dev/group_vars/all.yml b/inventories/dev/group_vars/all.yml index d09089b..7d19167 100644 --- a/inventories/dev/group_vars/all.yml +++ b/inventories/dev/group_vars/all.yml @@ -97,17 +97,20 @@ openclaw_control_plane_profiles: nats_password: "{{ vault_openclaw_cp_nats_password_andrea | default('andrea-nats-local') }}" # Optional: expose local loopback services via Cloudflare Tunnel subdomains. -# Keep disabled until tunnel credentials and public domain are configured in vault.yml. -openclaw_cloudflare_domain: "example.com" -openclaw_cloudflare_tunnel_enabled: false -openclaw_cloudflare_tunnel_name: "zennook-openclaw" +# This host reuses existing credentials file from /home/efra/.cloudflared. +openclaw_cloudflare_domain: "hegga.cl" +openclaw_cloudflare_tunnel_enabled: "{{ inventory_hostname == 'zennook' }}" +openclaw_cloudflare_tunnel_name: "local1" openclaw_cloudflare_tunnel_run_user: efra openclaw_cloudflare_tunnel_run_group: efra -openclaw_cloudflare_tunnel_id: "{{ vault_cloudflare_tunnel_id | default('') }}" -openclaw_cloudflare_tunnel_credentials_json: "{{ vault_cloudflare_tunnel_credentials_json | default('') }}" +openclaw_cloudflare_tunnel_id: "dab4c78a-4d51-43b1-957e-42f6aecbe10a" +openclaw_cloudflare_tunnel_manage_credentials_file: false +openclaw_cloudflare_tunnel_credentials_file: "/home/efra/.cloudflared/dab4c78a-4d51-43b1-957e-42f6aecbe10a.json" openclaw_cloudflare_tunnel_manage_dns: false openclaw_cloudflare_tunnel_ingress: - - hostname: "dev-main-dashboard.{{ openclaw_cloudflare_domain }}" + - hostname: "local1.{{ openclaw_cloudflare_domain }}" + service: "ssh://127.0.0.1:22" + - hostname: "openclaw.{{ openclaw_cloudflare_domain }}" service: "http://127.0.0.1:19011" - hostname: "efra-core-ingress.{{ 
openclaw_cloudflare_domain }}" service: "http://127.0.0.1:30101" diff --git a/roles/openclaw_cloudflare_tunnel/defaults/main.yml b/roles/openclaw_cloudflare_tunnel/defaults/main.yml index 49dd358..4d91914 100644 --- a/roles/openclaw_cloudflare_tunnel/defaults/main.yml +++ b/roles/openclaw_cloudflare_tunnel/defaults/main.yml @@ -15,6 +15,7 @@ openclaw_cloudflare_tunnel_credentials_file: >- openclaw_cloudflare_tunnel_id: "" openclaw_cloudflare_tunnel_credentials_json: "" +openclaw_cloudflare_tunnel_manage_credentials_file: true openclaw_cloudflare_tunnel_metrics_port: 40500 # List of public hostnames to expose through this tunnel. diff --git a/roles/openclaw_cloudflare_tunnel/tasks/main.yml b/roles/openclaw_cloudflare_tunnel/tasks/main.yml index b018d1f..300b787 100644 --- a/roles/openclaw_cloudflare_tunnel/tasks/main.yml +++ b/roles/openclaw_cloudflare_tunnel/tasks/main.yml @@ -15,13 +15,54 @@ that: - openclaw_cloudflare_tunnel_name | length > 0 - openclaw_cloudflare_tunnel_id | length > 0 - - openclaw_cloudflare_tunnel_credentials_json | length > 0 - openclaw_cloudflare_tunnel_ingress | length > 0 fail_msg: >- - Set openclaw_cloudflare_tunnel_id, openclaw_cloudflare_tunnel_credentials_json - and at least one openclaw_cloudflare_tunnel_ingress route before enabling the role. + Set openclaw_cloudflare_tunnel_id and at least one + openclaw_cloudflare_tunnel_ingress route before enabling the role. when: openclaw_cloudflare_tunnel_enabled | bool +- name: Validate inline credentials when role manages credentials file + ansible.builtin.assert: + that: + - openclaw_cloudflare_tunnel_credentials_json | length > 0 + fail_msg: >- + openclaw_cloudflare_tunnel_manage_credentials_file=true requires + openclaw_cloudflare_tunnel_credentials_json. 
+ when: + - openclaw_cloudflare_tunnel_enabled | bool + - openclaw_cloudflare_tunnel_manage_credentials_file | bool + +- name: Validate external credentials file path when inline write is disabled + ansible.builtin.assert: + that: + - openclaw_cloudflare_tunnel_credentials_file | length > 0 + fail_msg: >- + openclaw_cloudflare_tunnel_manage_credentials_file=false requires + openclaw_cloudflare_tunnel_credentials_file pointing to an existing file. + when: + - openclaw_cloudflare_tunnel_enabled | bool + - not (openclaw_cloudflare_tunnel_manage_credentials_file | bool) + +- name: Check external credentials file exists + ansible.builtin.stat: + path: "{{ openclaw_cloudflare_tunnel_credentials_file }}" + register: openclaw_cloudflare_tunnel_credentials_file_stat + when: + - openclaw_cloudflare_tunnel_enabled | bool + - not (openclaw_cloudflare_tunnel_manage_credentials_file | bool) + +- name: Fail when external credentials file is missing + ansible.builtin.assert: + that: + - openclaw_cloudflare_tunnel_credentials_file_stat.stat.exists + - openclaw_cloudflare_tunnel_credentials_file_stat.stat.isreg + fail_msg: >- + Cloudflare credentials file not found at {{ openclaw_cloudflare_tunnel_credentials_file }}. + Either provide this file on host or set openclaw_cloudflare_tunnel_manage_credentials_file=true + with openclaw_cloudflare_tunnel_credentials_json. 
+ when: + - openclaw_cloudflare_tunnel_enabled | bool + - not (openclaw_cloudflare_tunnel_manage_credentials_file | bool) - name: Validate Cloudflare ingress route schema ansible.builtin.assert: that: @@ -101,7 +142,9 @@ group: "{{ openclaw_cloudflare_tunnel_run_group }}" mode: '0600' no_log: true - when: openclaw_cloudflare_tunnel_enabled | bool + when: + - openclaw_cloudflare_tunnel_enabled | bool + - openclaw_cloudflare_tunnel_manage_credentials_file | bool - name: Render Cloudflare tunnel config ansible.builtin.template: From 980d3d9a1e476488c1055942097b949c8c8ba8d2 Mon Sep 17 00:00:00 2001 From: Efrain Garay Date: Sun, 1 Mar 2026 12:58:49 -0300 Subject: [PATCH 06/16] chore(cloudflare): migrate to dedicated tunnel and remove legacy local1 linkage --- inventories/dev/group_vars/all.yml | 12 +++++------- roles/openclaw_cloudflare_tunnel/defaults/main.yml | 3 ++- .../templates/cloudflared.service.j2 | 2 +- 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/inventories/dev/group_vars/all.yml b/inventories/dev/group_vars/all.yml index 7d19167..79cc28f 100644 --- a/inventories/dev/group_vars/all.yml +++ b/inventories/dev/group_vars/all.yml @@ -97,20 +97,18 @@ openclaw_control_plane_profiles: nats_password: "{{ vault_openclaw_cp_nats_password_andrea | default('andrea-nats-local') }}" # Optional: expose local loopback services via Cloudflare Tunnel subdomains. -# This host reuses existing credentials file from /home/efra/.cloudflared. +# This host uses a dedicated tunnel for openclaw-ansible services. 
openclaw_cloudflare_domain: "hegga.cl" openclaw_cloudflare_tunnel_enabled: "{{ inventory_hostname == 'zennook' }}" -openclaw_cloudflare_tunnel_name: "local1" +openclaw_cloudflare_tunnel_name: "openclaw-zennook" openclaw_cloudflare_tunnel_run_user: efra openclaw_cloudflare_tunnel_run_group: efra -openclaw_cloudflare_tunnel_id: "dab4c78a-4d51-43b1-957e-42f6aecbe10a" +openclaw_cloudflare_tunnel_id: "68554472-f797-431c-bfa3-42480ea7e5c6" openclaw_cloudflare_tunnel_manage_credentials_file: false -openclaw_cloudflare_tunnel_credentials_file: "/home/efra/.cloudflared/dab4c78a-4d51-43b1-957e-42f6aecbe10a.json" +openclaw_cloudflare_tunnel_credentials_file: "/home/efra/.cloudflared/68554472-f797-431c-bfa3-42480ea7e5c6.json" openclaw_cloudflare_tunnel_manage_dns: false openclaw_cloudflare_tunnel_ingress: - - hostname: "local1.{{ openclaw_cloudflare_domain }}" - service: "ssh://127.0.0.1:22" - - hostname: "openclaw.{{ openclaw_cloudflare_domain }}" + - hostname: "dev-main-dashboard.{{ openclaw_cloudflare_domain }}" service: "http://127.0.0.1:19011" - hostname: "efra-core-ingress.{{ openclaw_cloudflare_domain }}" service: "http://127.0.0.1:30101" diff --git a/roles/openclaw_cloudflare_tunnel/defaults/main.yml b/roles/openclaw_cloudflare_tunnel/defaults/main.yml index 4d91914..a7af97e 100644 --- a/roles/openclaw_cloudflare_tunnel/defaults/main.yml +++ b/roles/openclaw_cloudflare_tunnel/defaults/main.yml @@ -8,7 +8,8 @@ openclaw_cloudflare_tunnel_service_name: "cloudflared-{{ openclaw_cloudflare_tun openclaw_cloudflare_tunnel_run_user: "{{ ansible_user | default('openclaw') }}" openclaw_cloudflare_tunnel_run_group: "{{ ansible_user | default('openclaw') }}" -openclaw_cloudflare_tunnel_workdir: "/etc/openclaw/cloudflare/{{ openclaw_cloudflare_tunnel_name }}" +openclaw_cloudflare_tunnel_workdir: >- + /home/{{ openclaw_cloudflare_tunnel_run_user }}/.openclaw-cloudflare/{{ openclaw_cloudflare_tunnel_name }} openclaw_cloudflare_tunnel_config_path: "{{ 
openclaw_cloudflare_tunnel_workdir }}/config.yml" openclaw_cloudflare_tunnel_credentials_file: >- {{ openclaw_cloudflare_tunnel_workdir }}/{{ openclaw_cloudflare_tunnel_id }}.json diff --git a/roles/openclaw_cloudflare_tunnel/templates/cloudflared.service.j2 b/roles/openclaw_cloudflare_tunnel/templates/cloudflared.service.j2 index 4f9abff..8f5df0b 100644 --- a/roles/openclaw_cloudflare_tunnel/templates/cloudflared.service.j2 +++ b/roles/openclaw_cloudflare_tunnel/templates/cloudflared.service.j2 @@ -14,7 +14,7 @@ RestartSec=5 NoNewPrivileges=true PrivateTmp=true ProtectSystem=full -ProtectHome=true +ProtectHome=false ReadWritePaths={{ openclaw_cloudflare_tunnel_workdir }} [Install] From 6a46f592cc3b9f75c6fb1b680daa3955085cf58c Mon Sep 17 00:00:00 2001 From: Efrain Garay Date: Sun, 1 Mar 2026 13:27:39 -0300 Subject: [PATCH 07/16] fix(enterprise): add non-empty gateway token fallbacks for dev-main and andrea --- inventories/dev/group_vars/all.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/inventories/dev/group_vars/all.yml b/inventories/dev/group_vars/all.yml index 79cc28f..4339193 100644 --- a/inventories/dev/group_vars/all.yml +++ b/inventories/dev/group_vars/all.yml @@ -39,7 +39,7 @@ openclaw_enterprise_profiles: - browser bindings: [] env: - OPENCLAW_GATEWAY_TOKEN: "{{ vault_openclaw_gateway_token_dev_main | default('') }}" + OPENCLAW_GATEWAY_TOKEN: "{{ vault_openclaw_gateway_token_dev_main | default('tmp-dev-main-cfb2f8fdbd9144a88f1ec5a266') }}" OPENAI_API_KEY: "{{ vault_openai_api_key_dev | default('') }}" ANTHROPIC_API_KEY: "{{ vault_anthropic_api_key_dev | default('') }}" @@ -61,7 +61,7 @@ openclaw_enterprise_profiles: workspace: /home/openclaw/.openclaw-andrea/workspace bindings: [] env: - OPENCLAW_GATEWAY_TOKEN: "{{ vault_openclaw_gateway_token_andrea | default('') }}" + OPENCLAW_GATEWAY_TOKEN: "{{ vault_openclaw_gateway_token_andrea | default('tmp-andrea-7c53f22ab2e24c55bb9a2746ac') }}" OPENAI_API_KEY: "{{ 
vault_openai_api_key_dev | default('') }}" ANTHROPIC_API_KEY: "{{ vault_anthropic_api_key_dev | default('') }}" From 38dc62072348859c380e91b0c196d2a087760420 Mon Sep 17 00:00:00 2001 From: Efrain Garay Date: Sun, 1 Mar 2026 14:33:03 -0300 Subject: [PATCH 08/16] fix(ops): auto-load vault vars and harden control-plane deploy logs --- ops/install.sh | 9 ++++++++- roles/openclaw_control_plane/defaults/main.yml | 2 ++ roles/openclaw_control_plane/tasks/main.yml | 2 ++ roles/openclaw_control_plane/tasks/profile.yml | 4 ++-- roles/openclaw_enterprise/tasks/main.yml | 2 ++ 5 files changed, 16 insertions(+), 3 deletions(-) diff --git a/ops/install.sh b/ops/install.sh index fe8da6d..ddbe23e 100755 --- a/ops/install.sh +++ b/ops/install.sh @@ -8,15 +8,22 @@ source "${SCRIPT_DIR}/common.sh" ansible_bin="$(resolve_ansible_bin)" inventory_file="$(resolve_inventory)" limit_host="$(resolve_limit)" +inventory_dir="$(cd "$(dirname "${inventory_file}")" && pwd)" +vault_file="${VAULT_FILE:-${inventory_dir}/group_vars/vault.yml}" need_cmd "${ansible_bin}" [[ -f "${inventory_file}" ]] || die "Inventory not found: ${inventory_file}" extra_args=() +if [[ -f "${vault_file}" ]]; then + extra_args+=( -e "@${vault_file}" ) + log "Including vault variables file: ${vault_file}" +fi + if [[ -n "${ANSIBLE_EXTRA_ARGS:-}" ]]; then # shellcheck disable=SC2206 - extra_args=( ${ANSIBLE_EXTRA_ARGS} ) + extra_args+=( ${ANSIBLE_EXTRA_ARGS} ) fi log "Running enterprise install (inventory=${inventory_file}, limit=${limit_host})." 
diff --git a/roles/openclaw_control_plane/defaults/main.yml b/roles/openclaw_control_plane/defaults/main.yml index c40e780..bee3fb7 100644 --- a/roles/openclaw_control_plane/defaults/main.yml +++ b/roles/openclaw_control_plane/defaults/main.yml @@ -10,6 +10,8 @@ openclaw_control_plane_owner: "{{ ansible_user | default('openclaw') }}" openclaw_control_plane_group: "{{ ansible_user | default('openclaw') }}" openclaw_control_plane_env_owner: "{{ openclaw_control_plane_owner }}" openclaw_control_plane_env_group: "{{ openclaw_control_plane_group }}" +openclaw_control_plane_health_retries: 20 +openclaw_control_plane_health_delay: 3 # Modes: # - full: all services (router + broker + workers + observability) diff --git a/roles/openclaw_control_plane/tasks/main.yml b/roles/openclaw_control_plane/tasks/main.yml index 254a54a..32ee396 100644 --- a/roles/openclaw_control_plane/tasks/main.yml +++ b/roles/openclaw_control_plane/tasks/main.yml @@ -18,6 +18,7 @@ loop: "{{ openclaw_control_plane_profiles }}" loop_control: loop_var: profile + no_log: true - name: Validate unique control-plane profile names ansible.builtin.assert: @@ -38,6 +39,7 @@ loop: "{{ openclaw_control_plane_profiles }}" loop_control: loop_var: profile + no_log: true - name: Ensure control-plane runtime root exists ansible.builtin.file: diff --git a/roles/openclaw_control_plane/tasks/profile.yml b/roles/openclaw_control_plane/tasks/profile.yml index 5f268a5..8673fa4 100644 --- a/roles/openclaw_control_plane/tasks/profile.yml +++ b/roles/openclaw_control_plane/tasks/profile.yml @@ -85,7 +85,7 @@ method: GET status_code: 200 register: profile_health - retries: 10 - delay: 3 + retries: "{{ openclaw_control_plane_health_retries }}" + delay: "{{ openclaw_control_plane_health_delay }}" until: profile_health is succeeded when: openclaw_control_plane_manage_stack | bool diff --git a/roles/openclaw_enterprise/tasks/main.yml b/roles/openclaw_enterprise/tasks/main.yml index a30aa3c..d1595e6 100644 --- 
a/roles/openclaw_enterprise/tasks/main.yml +++ b/roles/openclaw_enterprise/tasks/main.yml @@ -16,6 +16,7 @@ loop: "{{ openclaw_enterprise_profiles }}" loop_control: loop_var: profile + no_log: true - name: Validate unique enterprise profile names ansible.builtin.assert: @@ -100,6 +101,7 @@ loop: "{{ openclaw_enterprise_profiles }}" loop_control: loop_var: profile + no_log: true - name: Render enterprise profile config ansible.builtin.template: From e607c6e9e29d511683fdb636c936ef53984b0c0d Mon Sep 17 00:00:00 2001 From: Efrain Garay Date: Sun, 1 Mar 2026 16:42:39 -0300 Subject: [PATCH 09/16] docs: add detailed operator runbook with mermaid flows --- README.md | 1 + control-plane/src/common/intents.ts | 32 +- control-plane/src/ingress/ingress.service.ts | 50 ++ docs/control-plane-stage2.md | 14 + docs/operator-runbook.md | 472 ++++++++++++++++++ .../templates/docker-compose.full.yml.j2 | 49 ++ 6 files changed, 617 insertions(+), 1 deletion(-) create mode 100644 docs/operator-runbook.md diff --git a/README.md b/README.md index ec93259..f3433d8 100644 --- a/README.md +++ b/README.md @@ -148,6 +148,7 @@ ansible-playbook playbook.yml --ask-become-pass ## Documentation +- [Operator Runbook](docs/operator-runbook.md) - End-to-end profile/agent/OAuth/queue operations guide - [Configuration Guide](docs/configuration.md) - All configuration options - [Development Mode](docs/development-mode.md) - Build from source - [Security Architecture](docs/security.md) - Security details diff --git a/control-plane/src/common/intents.ts b/control-plane/src/common/intents.ts index df827b5..0c18c7d 100644 --- a/control-plane/src/common/intents.ts +++ b/control-plane/src/common/intents.ts @@ -1,17 +1,27 @@ -const ROUTING_RULES: Array<{ intent: string; agent: string; keywords: string[] }> = [ +interface RoutingRule { + intent: string; + agent: string; + description: string; + keywords: string[]; +} + +const ROUTING_RULES: RoutingRule[] = [ { intent: 'browser.login', agent: 'browser-login', + 
description: 'Login/OAuth and managed browser operations', keywords: ['login', 'browser', 'portal', 'cookie', 'captcha'] }, { intent: 'deploy.coolify', agent: 'coolify-ops', + description: 'Coolify service lifecycle and deployment operations', keywords: ['coolify', 'deploy', 'release', 'rollback', 'service up', 'service down'] }, { intent: 'research.analysis', agent: 'research', + description: 'Research, comparisons, and technical analysis', keywords: ['investiga', 'analiza', 'research', 'comparar', 'resumen', 'benchmark'] } ]; @@ -33,3 +43,23 @@ export function actionNeedsConfirmation(text: string): boolean { lowered.includes(token) ); } + +export function listAvailableAgents(): Array<{ id: string; intent: string; description: string }> { + const agents = new Map(); + + agents.set('main', { + id: 'main', + intent: 'general.main', + description: 'General coordinator and fallback' + }); + + for (const rule of ROUTING_RULES) { + agents.set(rule.agent, { + id: rule.agent, + intent: rule.intent, + description: rule.description + }); + } + + return Array.from(agents.values()); +} diff --git a/control-plane/src/ingress/ingress.service.ts b/control-plane/src/ingress/ingress.service.ts index d2e61e1..6892998 100644 --- a/control-plane/src/ingress/ingress.service.ts +++ b/control-plane/src/ingress/ingress.service.ts @@ -4,6 +4,7 @@ import type { NatsConnection } from 'nats'; import { type AppConfig, loadConfig } from '../common/config'; import { type TaskEnvelope } from '../common/contracts'; +import { listAvailableAgents } from '../common/intents'; import { connectNats, encodeJson, ensureStream } from '../common/nats'; interface TelegramUpdate { @@ -44,6 +45,10 @@ export class IngressService implements OnModuleInit, OnModuleDestroy { throw new Error('Message text is required'); } + if (await this.tryHandleTelegramCommand(text, chatId)) { + return { taskId: `cmd-${randomUUID()}` }; + } + const task: TaskEnvelope = { taskId: randomUUID(), profile: this.cfg.profile, @@ -65,4 
+70,49 @@ export class IngressService implements OnModuleInit, OnModuleDestroy { return { taskId: task.taskId }; } + + private async tryHandleTelegramCommand(text: string, chatId: string): Promise { + if (!this.isAgentsCommand(text)) { + return false; + } + + if (!this.cfg.telegramBotToken || !chatId) { + return false; + } + + const agents = listAvailableAgents(); + const lines = [ + `Available agents (${this.cfg.profile}):`, + ...agents.map((agent) => `- ${agent.id}: ${agent.description} [${agent.intent}]`), + '', + 'Usage: send a normal request and the router will select the target by intent.' + ]; + + await this.sendTelegramMessage(chatId, lines.join('\n')); + return true; + } + + private isAgentsCommand(text: string): boolean { + const firstToken = text.trim().split(/\s+/)[0]?.toLowerCase() ?? ''; + return firstToken === '/agents' || firstToken.startsWith('/agents@'); + } + + private async sendTelegramMessage(chatId: string, text: string): Promise { + const response = await fetch( + `https://api.telegram.org/bot${this.cfg.telegramBotToken}/sendMessage`, + { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ + chat_id: chatId, + text + }) + } + ); + + if (!response.ok) { + const body = await response.text(); + throw new Error(`Telegram send failed: ${response.status} ${body}`); + } + } } diff --git a/docs/control-plane-stage2.md b/docs/control-plane-stage2.md index 52e50e4..6ec3b04 100644 --- a/docs/control-plane-stage2.md +++ b/docs/control-plane-stage2.md @@ -26,6 +26,10 @@ Router classifies intent and emits `tasks.agent.`. Workers consume per-agent queues and emit `results.agent.`. Broker persists outputs and can send Telegram replies. +Ingress also supports a direct Telegram command: + +- `/agents` (or `/agents@`) to list available agents and intent mappings without queueing a task. 
+ ## Contract Task envelope fields: @@ -74,3 +78,13 @@ You can publish these loopback endpoints through Cloudflare Tunnel subdomains by To install this package on another profile, add one object to `openclaw_control_plane_profiles`. No code changes are required, only profile variables and secrets. + +## Browser worker networking (full mode) + +For browser-driven flows (`browser-login`) the full stack template uses: + +- `network_mode: host` +- `shm_size: "1gb"` +- worker-local `NATS_URL` override to `127.0.0.1:` + +This keeps queue consumption stable while allowing browser-related operations to reach host-local gateway/browser relay paths. diff --git a/docs/operator-runbook.md b/docs/operator-runbook.md new file mode 100644 index 0000000..d2912e9 --- /dev/null +++ b/docs/operator-runbook.md @@ -0,0 +1,472 @@ +--- +title: Operator Runbook (Profiles, Agents, OAuth, Queues) +summary: End-to-end step-by-step guide to deploy, operate, extend, and troubleshoot multi-profile OpenClaw with Stage 2 control-plane. +--- + +# Operator Runbook + +This runbook is the canonical step-by-step guide to: + +- Install and reconcile OpenClaw with Ansible. +- Operate multi-profile gateways (`dev-main`, `andrea`, and new profiles). +- Operate Stage 2 control-plane (`full` and `lite`). +- Login providers (OpenAI Codex OAuth) per profile. +- Create new profiles and new agents safely. +- Route Telegram traffic to agents and validate queue execution. + +Use this with: + +- [Enterprise Deployment](enterprise-deployment.md) +- [Stage 2 Control Plane](control-plane-stage2.md) +- [Operations Workflow](operations-workflow.md) +- [Troubleshooting](troubleshooting.md) + +## 1. 
Architecture and Responsibilities + +### 1.1 Logical topology + +```mermaid +flowchart LR + A[Ansible control host] --> B[openclaw role] + A --> C[openclaw_enterprise role] + A --> D[openclaw_control_plane role] + A --> E[openclaw_cloudflare_tunnel role] + + C --> F[Gateway profile: dev-main] + C --> G[Gateway profile: andrea] + + D --> H[Control-plane: efra-core full] + D --> I[Control-plane: andrea lite] + + H --> H1[NATS] + H --> H2[Postgres] + H --> H3[Ingress] + H --> H4[Router] + H --> H5[Workers] + H --> H6[Broker] + H --> H7[Control API] + H --> H8[Prometheus/Grafana/Kuma] + + I --> I1[NATS] + I --> I2[Postgres] + I --> I3[Ingress] + I --> I4[Router forced main] + I --> I5[Worker main] + I --> I6[Broker] + I --> I7[Control API] +``` + +### 1.2 Message flow (Telegram to agent and back) + +```mermaid +sequenceDiagram + autonumber + participant TG as Telegram + participant ING as ingress + participant RT as router + participant NATS as NATS JetStream + participant WK as worker- + participant BR as broker + participant PG as PostgreSQL + + TG->>ING: POST /telegram/webhook + ING->>NATS: publish tasks.ingress + RT->>NATS: consume tasks.ingress + RT->>NATS: publish tasks.agent. + WK->>NATS: consume tasks.agent. + WK->>NATS: publish results.agent. + BR->>NATS: consume results.agent.* + BR->>PG: upsert tasks + insert task_events + BR->>TG: sendMessage (final response) +``` + +### 1.3 Component map in this repository + +- Enterprise playbook: `playbooks/enterprise.yml` +- Gateway profile role: `roles/openclaw_enterprise` +- Control-plane role: `roles/openclaw_control_plane` +- Stage 2 source code: `control-plane/` +- Environment inventory: `inventories/dev/group_vars/all.yml` + +## 2. Prerequisites + +- Supported OS on target nodes: Debian, Ubuntu, Fedora. +- Sudo privileges on target node(s). +- `ansible`, `git`, `python3`. +- Telegram Bot token(s) in Vault or inventory variables. +- OpenClaw gateway token per profile (`OPENCLAW_GATEWAY_TOKEN`). 
+ +Optional: + +- Cloudflare tunnel credentials for remote subdomains. +- Tailscale enabled on target nodes. + +## 3. Full Install / Reconcile from Zero + +The recommended operator path is Makefile + `ops/*.sh`. + +```bash +cd /home/efra/openclaw-ansible + +# 1) Backup current runtime +make backup ENV=dev LIMIT=zennook + +# 2) Purge runtime (explicit confirmation required) +make purge CONFIRM=1 ENV=dev LIMIT=zennook + +# 3) Deploy enterprise profiles + control-plane +make install ENV=dev LIMIT=zennook + +# 4) Optional Cloudflare reconcile +make cloudflare ENV=dev LIMIT=zennook + +# 5) OAuth login (interactive browser flow) +make oauth-login ENV=dev LIMIT=zennook PROFILES="dev-main andrea" OAUTH_PROVIDER=openai-codex + +# 6) Run smoke tests +make smoke ENV=dev LIMIT=zennook +``` + +Equivalent direct Ansible command: + +```bash +ansible-playbook -i inventories/dev/hosts.yml playbooks/enterprise.yml --ask-become-pass --limit zennook +``` + +## 4. Provider Login (OpenAI Codex OAuth) + +### 4.1 Why OAuth per profile + +Auth state is profile-specific (`--profile `). +If you run multiple profiles, log in once for each profile. 
+ +### 4.2 Standard login command + +```bash +sudo -u openclaw -H /home/openclaw/.local/bin/openclaw --profile dev-main models auth login --provider openai-codex +sudo -u openclaw -H /home/openclaw/.local/bin/openclaw --profile andrea models auth login --provider openai-codex +``` + +### 4.3 OAuth flow details + +```mermaid +sequenceDiagram + participant CLI as openclaw CLI + participant B as Browser + participant O as OpenAI OAuth + participant L as localhost:1455 callback + participant P as OpenClaw profile state + + CLI->>B: Open authorize URL + B->>O: User signs in and approves + O->>L: Redirect with auth code + CLI->>P: Store OAuth credentials for profile + CLI-->>CLI: provider ready in this profile +``` + +### 4.4 Verify login + +```bash +sudo -u openclaw -H /home/openclaw/.local/bin/openclaw --profile dev-main status --all +sudo -u openclaw -H /home/openclaw/.local/bin/openclaw --profile dev-main models list +``` + +## 5. Create a New Profile (Gateway + Optional Control-Plane) + +This section describes the exact files and steps. 
+ +### 5.1 Add gateway profile to inventory + +Edit `inventories/dev/group_vars/all.yml` and add an item under `openclaw_enterprise_profiles`: + +```yaml +- name: ops-lab + gateway_port: 19041 + gateway_bind: loopback + state_dir: /home/openclaw/.openclaw-ops-lab + config_path: /home/openclaw/.openclaw-ops-lab/openclaw.json + workspace_root: /home/openclaw/.openclaw-ops-lab/workspace + model_primary: openai/gpt-5-mini + model_fallbacks: + - anthropic/claude-sonnet-4-5 + tools_profile: coding + sandbox_mode: non-main + sandbox_scope: session + agents: + - id: main + default: true + workspace: /home/openclaw/.openclaw-ops-lab/workspace + env: + OPENCLAW_GATEWAY_TOKEN: "{{ vault_openclaw_gateway_token_ops_lab }}" + OPENAI_API_KEY: "" + ANTHROPIC_API_KEY: "" +``` + +### 5.2 Add secrets in Vault + +Edit `inventories/dev/group_vars/vault.yml`: + +```yaml +vault_openclaw_gateway_token_ops_lab: "replace-with-strong-random-token" +``` + +Encrypt if needed: + +```bash +ansible-vault encrypt inventories/dev/group_vars/vault.yml +``` + +### 5.3 (Optional) Add control-plane profile for this gateway profile + +Under `openclaw_control_plane_profiles`: + +```yaml +- name: ops-lab + mode: lite + gateway_profile: ops-lab + project_dir: /home/efra/openclaw-control-plane/ops-lab + ingress_port: 30121 + control_api_port: 39121 + telegram_bot_token: "{{ vault_telegram_bot_token_ops_lab }}" + telegram_default_chat_id: "{{ vault_telegram_default_chat_id_ops_lab }}" + postgres_password: "{{ vault_openclaw_cp_postgres_password_ops_lab }}" + nats_user: queue + nats_password: "{{ vault_openclaw_cp_nats_password_ops_lab }}" +``` + +Vault keys: + +```yaml +vault_telegram_bot_token_ops_lab: "" +vault_telegram_default_chat_id_ops_lab: "" +vault_openclaw_cp_postgres_password_ops_lab: "replace" +vault_openclaw_cp_nats_password_ops_lab: "replace" +``` + +### 5.4 Deploy and verify + +```bash +make install ENV=dev LIMIT=zennook +sudo -u openclaw -H /home/openclaw/.local/bin/openclaw --profile ops-lab 
status --all +``` + +### 5.5 Login provider for new profile + +```bash +sudo -u openclaw -H /home/openclaw/.local/bin/openclaw --profile ops-lab models auth login --provider openai-codex +``` + +## 6. Create a New Agent Inside an Existing Profile + +### 6.1 Add agent in inventory profile + +Inside the profile `agents:` list in `inventories/dev/group_vars/all.yml`: + +```yaml +- id: qa + workspace: /home/openclaw/.openclaw-dev-main/workspace-qa + tools: + profile: coding +``` + +Redeploy: + +```bash +make install ENV=dev LIMIT=zennook +``` + +### 6.2 Create identity/memory files for the new agent workspace + +```bash +sudo -u openclaw -H install -d -m 700 /home/openclaw/.openclaw-dev-main/workspace-qa/memory + +sudo -u openclaw -H bash -lc 'cat > /home/openclaw/.openclaw-dev-main/workspace-qa/IDENTITY.md < /home/openclaw/.openclaw-dev-main/workspace-qa/MEMORY.md < /home/openclaw/.openclaw-dev-main/workspace-qa/AGENTS.md < `browser-login` keywords: `login`, `browser`, `portal`, `cookie`, `captcha` +- `deploy.coolify` -> `coolify-ops` keywords: `coolify`, `deploy`, `release`, `rollback`, `service up`, `service down` +- `research.analysis` -> `research` keywords: `investiga`, `analiza`, `research`, `comparar`, `resumen`, `benchmark` +- fallback -> `main` + +### 7.3 Browser-login worker networking note + +For browser workflows, `worker-browser-login` uses host networking in full mode template: + +- `network_mode: host` +- `shm_size: "1gb"` +- `NATS_URL` override to loopback-published NATS (`127.0.0.1:14222`) + +This avoids hanging tasks when browser automation needs host-level gateway/browser relay access. + +## 8. Daily Command Reference (Per Profile) + +Replace `` with `dev-main`, `andrea`, or your custom profile. 
+ +### 8.1 Health and status + +```bash +sudo -u openclaw -H /home/openclaw/.local/bin/openclaw --profile status --all +sudo -u openclaw -H /home/openclaw/.local/bin/openclaw --profile doctor --fix +sudo -u openclaw -H /home/openclaw/.local/bin/openclaw --profile security audit --deep +``` + +### 8.2 Gateway lifecycle + +```bash +sudo -u openclaw -H /home/openclaw/.local/bin/openclaw --profile gateway status +sudo -u openclaw -H /home/openclaw/.local/bin/openclaw --profile gateway stop +sudo -u openclaw -H /home/openclaw/.local/bin/openclaw --profile gateway start +``` + +### 8.3 Agent command execution + +```bash +sudo -u openclaw -H /home/openclaw/.local/bin/openclaw --profile agent --agent main --message "hola" --json +sudo -u openclaw -H /home/openclaw/.local/bin/openclaw --profile agent --agent research --message "investiga X" --json +``` + +### 8.4 Onboarding + +```bash +sudo -u openclaw -H /home/openclaw/.local/bin/openclaw --profile onboard --install-daemon +``` + +## 9. Stage 2 Validation Checklist + +```bash +# service health +curl -fsS http://127.0.0.1:30101/health +curl -fsS http://127.0.0.1:39101/health + +# queue stats +curl -fsS http://127.0.0.1:39101/queues + +# simulate ingress +curl -fsS -X POST http://127.0.0.1:30101/ingress/simulate \ + -H "content-type: application/json" \ + -d '{"text":"investiga como te llamas y cual es tu labor","chatId":"local-sim"}' +``` + +Telegram E2E checklist: + +1. Send `/agents` and verify catalog. +2. Send `investiga ...` and verify `[agent=research]`. +3. Send `login ...` and verify `[agent=browser-login]`. +4. Send `coolify deploy ...` and verify `[agent=coolify-ops]`. +5. Send generic `hola` and verify `[agent=main]`. + +## 10. Troubleshooting Matrix + +### 10.1 Missing gateway token + +Symptom: + +- `MissingEnvVarError: Missing env var "OPENCLAW_GATEWAY_TOKEN"` + +Actions: + +1. Check profile env file exists: + - `/etc/openclaw/secrets/.env` +2. Confirm token is present. +3. 
Re-run: + - `openclaw --profile doctor --fix` + +### 10.2 Wrong sudo syntax + +Symptom: + +- `sudo: unrecognized option '--profile'` + +Cause: + +- `--profile` belongs to `openclaw`, not `sudo`. + +Correct: + +```bash +sudo -u openclaw -H /home/openclaw/.local/bin/openclaw --profile dev-main doctor --fix +``` + +### 10.3 OAuth command says no provider plugins found + +Action: + +1. Ensure correct OpenClaw install/profile. +2. Ensure bundled plugins path exists: + - `/home/openclaw/.openclaw/bundled-extensions` +3. Prefer onboarding wizard for first-time provider setup if plugin state is inconsistent. + +### 10.4 Browser task hangs or no Telegram response + +Actions: + +1. Check worker backlog in NATS (`num_ack_pending` / `num_pending`). +2. Ensure `worker-browser-login` has host networking in full stack template. +3. Ensure browser relay is attached (`tabs > 0`) before screenshot/login tasks. + +### 10.5 Shell completion missing file + +Symptom: + +- `bash: .../completions/openclaw.bash: No existe ...` + +Actions: + +1. `openclaw --profile doctor --fix` +2. If needed, create completion directory/file and fix ownership. + +## 11. Safe Change Process (Recommended) + +When adding profiles/agents: + +1. Update inventory + Vault. +2. `make install`. +3. Run OAuth login for target profiles. +4. Run smoke checks. +5. Validate Telegram E2E. +6. Commit changes with docs + inventory + role/template updates together. + +This keeps code, runtime behavior, and operational docs aligned. 
diff --git a/roles/openclaw_control_plane/templates/docker-compose.full.yml.j2 b/roles/openclaw_control_plane/templates/docker-compose.full.yml.j2 index 110d5b8..f6cb4e4 100644 --- a/roles/openclaw_control_plane/templates/docker-compose.full.yml.j2 +++ b/roles/openclaw_control_plane/templates/docker-compose.full.yml.j2 @@ -3,6 +3,8 @@ services: image: nats:2.10-alpine command: ["-js", "-sd", "/data", "-m", "8222", "--user", "${NATS_USER}", "--pass", "${NATS_PASSWORD}"] restart: unless-stopped + ports: + - "127.0.0.1:{{ profile.nats_host_port | default(14222) }}:4222" volumes: - ./data/nats:/data @@ -72,6 +74,17 @@ services: environment: METRICS_PORT: 9411 WORKER_AGENT_ID: main + WORKER_EXEC_MODE: ${WORKER_EXEC_MODE} + OPENCLAW_BIN: ${OPENCLAW_BIN} + OPENCLAW_HOME: ${OPENCLAW_HOME} + OPENCLAW_ENV_FILE: ${OPENCLAW_ENV_FILE} + OPENCLAW_TIMEOUT_MS: ${OPENCLAW_TIMEOUT_MS} + OPENCLAW_BUNDLED_PLUGINS_DIR: ${OPENCLAW_BUNDLED_PLUGINS_DIR} +{% if profile.worker_exec_mode | default('stub') == 'openclaw' %} + user: "994:994" + volumes: + - /home/openclaw:/home/openclaw +{% endif %} restart: unless-stopped depends_on: - nats @@ -87,6 +100,17 @@ services: environment: METRICS_PORT: 9412 WORKER_AGENT_ID: research + WORKER_EXEC_MODE: ${WORKER_EXEC_MODE} + OPENCLAW_BIN: ${OPENCLAW_BIN} + OPENCLAW_HOME: ${OPENCLAW_HOME} + OPENCLAW_ENV_FILE: ${OPENCLAW_ENV_FILE} + OPENCLAW_TIMEOUT_MS: ${OPENCLAW_TIMEOUT_MS} + OPENCLAW_BUNDLED_PLUGINS_DIR: ${OPENCLAW_BUNDLED_PLUGINS_DIR} +{% if profile.worker_exec_mode | default('stub') == 'openclaw' %} + user: "994:994" + volumes: + - /home/openclaw:/home/openclaw +{% endif %} restart: unless-stopped depends_on: - nats @@ -102,6 +126,20 @@ services: environment: METRICS_PORT: 9413 WORKER_AGENT_ID: browser-login + NATS_URL: nats://{{ profile.nats_user | default('queue') }}:{{ profile.nats_password }}@127.0.0.1:{{ profile.nats_host_port | default(14222) }} + WORKER_EXEC_MODE: ${WORKER_EXEC_MODE} + OPENCLAW_BIN: ${OPENCLAW_BIN} + OPENCLAW_HOME: 
${OPENCLAW_HOME} + OPENCLAW_ENV_FILE: ${OPENCLAW_ENV_FILE} + OPENCLAW_TIMEOUT_MS: ${OPENCLAW_TIMEOUT_MS} + OPENCLAW_BUNDLED_PLUGINS_DIR: ${OPENCLAW_BUNDLED_PLUGINS_DIR} +{% if profile.worker_exec_mode | default('stub') == 'openclaw' %} + network_mode: host + shm_size: "1gb" + user: "994:994" + volumes: + - /home/openclaw:/home/openclaw +{% endif %} restart: unless-stopped depends_on: - nats @@ -117,6 +155,17 @@ services: environment: METRICS_PORT: 9414 WORKER_AGENT_ID: coolify-ops + WORKER_EXEC_MODE: ${WORKER_EXEC_MODE} + OPENCLAW_BIN: ${OPENCLAW_BIN} + OPENCLAW_HOME: ${OPENCLAW_HOME} + OPENCLAW_ENV_FILE: ${OPENCLAW_ENV_FILE} + OPENCLAW_TIMEOUT_MS: ${OPENCLAW_TIMEOUT_MS} + OPENCLAW_BUNDLED_PLUGINS_DIR: ${OPENCLAW_BUNDLED_PLUGINS_DIR} +{% if profile.worker_exec_mode | default('stub') == 'openclaw' %} + user: "994:994" + volumes: + - /home/openclaw:/home/openclaw +{% endif %} restart: unless-stopped depends_on: - nats From 821b748d56c0155fb0f41b4aaa7c35b74f01227a Mon Sep 17 00:00:00 2001 From: Efrain Garay Date: Sun, 1 Mar 2026 16:49:24 -0300 Subject: [PATCH 10/16] feat: complete pending stage2 execution and multi-os automation --- .gitignore | 9 ++ control-plane/src/common/config.ts | 17 ++- control-plane/src/worker/worker.runner.ts | 131 ++++++++++++++++-- docs/troubleshooting.md | 56 ++++++++ inventories/dev/group_vars/all.yml | 11 +- roles/openclaw/tasks/docker-redhat.yml | 50 +++++++ roles/openclaw/tasks/firewall-redhat.yml | 64 +++++++++ roles/openclaw/tasks/main.yml | 44 +++++- roles/openclaw/tasks/nodejs-debian.yml | 71 ++++++++++ roles/openclaw/tasks/nodejs-redhat.yml | 49 +++++++ roles/openclaw/tasks/nodejs.yml | 73 +--------- roles/openclaw/tasks/system-tools-linux.yml | 5 + roles/openclaw/tasks/system-tools-redhat.yml | 55 ++++++++ roles/openclaw/tasks/system-tools.yml | 9 +- roles/openclaw/tasks/tailscale-redhat.yml | 44 ++++++ roles/openclaw/tasks/user.yml | 14 +- .../templates/control-plane.env.j2 | 7 + .../templates/docker-compose.lite.yml.j2 | 11 ++ 
18 files changed, 625 insertions(+), 95 deletions(-) create mode 100644 roles/openclaw/tasks/docker-redhat.yml create mode 100644 roles/openclaw/tasks/firewall-redhat.yml create mode 100644 roles/openclaw/tasks/nodejs-debian.yml create mode 100644 roles/openclaw/tasks/nodejs-redhat.yml create mode 100644 roles/openclaw/tasks/system-tools-redhat.yml create mode 100644 roles/openclaw/tasks/tailscale-redhat.yml diff --git a/.gitignore b/.gitignore index de86135..29d7b1e 100644 --- a/.gitignore +++ b/.gitignore @@ -13,3 +13,12 @@ vault.yml id_rsa* host_vars/ group_vars/ + +# Keep inventory vars tracked (except vault secrets) +!inventories/ +!inventories/**/ +!inventories/**/group_vars/ +!inventories/**/group_vars/*.yml +!inventories/**/host_vars/ +!inventories/**/host_vars/.gitkeep +inventories/**/group_vars/vault.yml diff --git a/control-plane/src/common/config.ts b/control-plane/src/common/config.ts index e116be9..255dbab 100644 --- a/control-plane/src/common/config.ts +++ b/control-plane/src/common/config.ts @@ -9,6 +9,13 @@ export interface AppConfig { telegramDefaultChatId: string; routerForcedAgent: string; workerAgentId: string; + workerExecMode: string; + openclawBin: string; + openclawHome: string; + openclawEnvFile: string; + openclawGatewayToken: string; + openclawTimeoutMs: number; + openclawBundledPluginsDir: string; } function intFromEnv(name: string, fallback: number): number { @@ -31,6 +38,14 @@ export function loadConfig(serviceName: string): AppConfig { telegramBotToken: process.env.TELEGRAM_BOT_TOKEN ?? '', telegramDefaultChatId: process.env.TELEGRAM_DEFAULT_CHAT_ID ?? '', routerForcedAgent: process.env.ROUTER_FORCED_AGENT ?? '', - workerAgentId: process.env.WORKER_AGENT_ID ?? 'main' + workerAgentId: process.env.WORKER_AGENT_ID ?? 'main', + workerExecMode: process.env.WORKER_EXEC_MODE ?? 'stub', + openclawBin: process.env.OPENCLAW_BIN ?? '/home/openclaw/.local/bin/openclaw', + openclawHome: process.env.OPENCLAW_HOME ?? 
'/home/openclaw', + openclawEnvFile: process.env.OPENCLAW_ENV_FILE ?? `/etc/openclaw/secrets/${process.env.OPENCLAW_PROFILE ?? 'dev-main'}.env`, + openclawGatewayToken: process.env.OPENCLAW_GATEWAY_TOKEN ?? '', + openclawTimeoutMs: intFromEnv('OPENCLAW_TIMEOUT_MS', 120000), + openclawBundledPluginsDir: + process.env.OPENCLAW_BUNDLED_PLUGINS_DIR ?? '/home/openclaw/.openclaw/bundled-extensions' }; } diff --git a/control-plane/src/worker/worker.runner.ts b/control-plane/src/worker/worker.runner.ts index 97fe4b6..75736bc 100644 --- a/control-plane/src/worker/worker.runner.ts +++ b/control-plane/src/worker/worker.runner.ts @@ -1,4 +1,6 @@ import { Injectable, Logger, OnModuleDestroy, OnModuleInit } from '@nestjs/common'; +import { execFile } from 'node:child_process'; +import { promisify } from 'node:util'; import type { ConsumerMessages, NatsConnection } from 'nats'; import { loadConfig } from '../common/config'; @@ -7,6 +9,8 @@ import { actionNeedsConfirmation } from '../common/intents'; import { type ServiceMetrics, initMetrics, startMetricsServer } from '../common/metrics'; import { connectNats, decodeJson, encodeJson, ensureConsumer, ensureStream } from '../common/nats'; +const execFileAsync = promisify(execFile); + @Injectable() export class WorkerRunner implements OnModuleInit, OnModuleDestroy { private readonly logger = new Logger(WorkerRunner.name); @@ -61,27 +65,134 @@ export class WorkerRunner implements OnModuleInit, OnModuleDestroy { private async processTask(task: TaskEnvelope): Promise { const needsConfirmation = actionNeedsConfirmation(task.text); - const summary = `Task ${task.taskId} routed to ${this.cfg.workerAgentId}`; - const fullResponse = needsConfirmation - ? 
`Action requires confirmation before execution: ${task.text}` - : `Processed by ${this.cfg.workerAgentId}: ${task.text}`; + if (needsConfirmation) { + return { + taskId: task.taskId, + profile: task.profile, + fromAgent: this.cfg.workerAgentId, + status: 'WAITING_CONFIRMATION', + summary: `Task ${task.taskId} routed to ${this.cfg.workerAgentId}`, + fullResponse: `Action requires confirmation before execution: ${task.text}`, + needsConfirmation: true, + suggestedAction: `confirmar ${task.taskId}`, + tokenUsage: Math.min(300, task.text.length * 2), + costEstimate: 0, + source: task.source, + createdAt: new Date().toISOString() + }; + } - const status: TaskResult['status'] = needsConfirmation ? 'WAITING_CONFIRMATION' : 'DONE'; + if (this.cfg.workerExecMode === 'openclaw') { + return this.processWithOpenClaw(task); + } return { taskId: task.taskId, profile: task.profile, fromAgent: this.cfg.workerAgentId, - status, - summary, - fullResponse, - needsConfirmation, - suggestedAction: needsConfirmation ? 
`confirmar ${task.taskId}` : undefined, + status: 'DONE', + summary: `Task ${task.taskId} routed to ${this.cfg.workerAgentId}`, + fullResponse: `Processed by ${this.cfg.workerAgentId}: ${task.text}`, + needsConfirmation: false, tokenUsage: Math.min(300, task.text.length * 2), costEstimate: 0, source: task.source, createdAt: new Date().toISOString() }; } + + private async processWithOpenClaw(task: TaskEnvelope): Promise { + const env = { + ...process.env, + HOME: this.cfg.openclawHome, + OPENCLAW_HOME: this.cfg.openclawHome, + OPENCLAW_GATEWAY_TOKEN: this.cfg.openclawGatewayToken, + OPENCLAW_BUNDLED_PLUGINS_DIR: this.cfg.openclawBundledPluginsDir + }; + + try { + const { stdout } = await execFileAsync( + this.cfg.openclawBin, + [ + '--profile', + this.cfg.profile, + 'agent', + '--agent', + this.cfg.workerAgentId, + '--message', + task.text, + '--json' + ], + { + env, + timeout: this.cfg.openclawTimeoutMs, + maxBuffer: 1024 * 1024 + } + ); + + const parsed = this.extractJson(stdout) as Record; + const payloads = Array.isArray(parsed?.payloads) + ? parsed.payloads + : Array.isArray(parsed?.result?.payloads) + ? parsed.result.payloads + : []; + const text = payloads + .map((item: { text?: string } | undefined) => item?.text ?? '') + .filter((line: string) => line.trim().length > 0) + .join('\n') + .trim(); + + const meta = parsed?.meta ?? parsed?.result?.meta ?? {}; + const totalTokens = Number(meta?.agentMeta?.usage?.total ?? 0); + + return { + taskId: task.taskId, + profile: task.profile, + fromAgent: this.cfg.workerAgentId, + status: 'DONE', + summary: `Task ${task.taskId} handled by OpenClaw agent ${this.cfg.workerAgentId}`, + fullResponse: text || `Agent ${this.cfg.workerAgentId} completed with empty text payload.`, + needsConfirmation: false, + tokenUsage: Number.isFinite(totalTokens) && totalTokens > 0 ? 
totalTokens : undefined, + costEstimate: 0, + source: task.source, + createdAt: new Date().toISOString() + }; + } catch (error) { + const detail = error instanceof Error ? error.message : String(error); + this.logger.warn(`OpenClaw exec failed for ${task.taskId}: ${detail}`); + + return { + taskId: task.taskId, + profile: task.profile, + fromAgent: this.cfg.workerAgentId, + status: 'FAILED', + summary: `Task ${task.taskId} failed in OpenClaw agent ${this.cfg.workerAgentId}`, + fullResponse: `Agent execution failed: ${detail}`, + needsConfirmation: false, + costEstimate: 0, + source: task.source, + createdAt: new Date().toISOString() + }; + } + } + + private extractJson(stdout: string): Record { + const trimmed = stdout.trim(); + if (!trimmed) { + return {}; + } + + try { + return JSON.parse(trimmed) as Record; + } catch { + const start = trimmed.indexOf('{'); + const end = trimmed.lastIndexOf('}'); + if (start >= 0 && end > start) { + return JSON.parse(trimmed.slice(start, end + 1)) as Record; + } + throw new Error('openclaw returned non-JSON output'); + } + } } diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md index f4ee2a2..19e605f 100644 --- a/docs/troubleshooting.md +++ b/docs/troubleshooting.md @@ -138,3 +138,59 @@ ansible-playbook playbook.yml --ask-become-pass sudo systemctl start docker # Re-run playbook ``` + +## Gateway Unreachable After Tailscale Exposure Change (dev-main) + +**Symptom**: +- `openclaw --profile dev-main status --all` shows gateway unreachable (`ECONNREFUSED 127.0.0.1:18789`) +- `gateway probe` may show `Connect: ok` but `RPC: failed - timeout` +- Mixed state after switching to `gateway.bind=tailnet` or enabling internal `gateway.tailscale.mode=serve` + +**Root cause**: +- Local profile clients still target loopback (`ws://127.0.0.1:18789`) while gateway binding/exposure was changed. +- Residual `tailscale ssh`/forward processes can remain attached to the service cgroup. 
+- Internal Tailscale serve from non-interactive service users may fail depending on tailnet policy. + +**Remediation (safe baseline)**: +```bash +# 1) Keep gateway local-only +sudo -iu openclaw /home/openclaw/.local/bin/openclaw --profile dev-main config set gateway.bind loopback +sudo -iu openclaw /home/openclaw/.local/bin/openclaw --profile dev-main config set gateway.tailscale.mode off + +# 2) Restart user service using openclaw user DBus +uid=$(id -u openclaw) +export XDG_RUNTIME_DIR=/run/user/$uid +export DBUS_SESSION_BUS_ADDRESS=unix:path=$XDG_RUNTIME_DIR/bus +sudo -u openclaw XDG_RUNTIME_DIR=$XDG_RUNTIME_DIR DBUS_SESSION_BUS_ADDRESS=$DBUS_SESSION_BUS_ADDRESS \ + systemctl --user restart openclaw-gateway-dev-main.service +sudo -u openclaw XDG_RUNTIME_DIR=$XDG_RUNTIME_DIR DBUS_SESSION_BUS_ADDRESS=$DBUS_SESSION_BUS_ADDRESS \ + systemctl --user enable openclaw-gateway-dev-main.service + +# 3) Validate +sudo -iu openclaw /home/openclaw/.local/bin/openclaw --profile dev-main gateway probe +sudo -iu openclaw /home/openclaw/.local/bin/openclaw --profile dev-main status --all +``` + +**Expected healthy probe**: +- `Local loopback ws://127.0.0.1:18789` +- `Connect: ok` +- `RPC: ok` + +**Expose dashboard over Tailscale (recommended pattern)**: +- Keep OpenClaw bind on loopback. 
+- Expose separately with Tailscale Serve (HTTPS path), only if Serve is enabled in tailnet admin: +```bash +sudo tailscale serve --bg http://127.0.0.1:18789 +tailscale serve status +``` + +**Security follow-up**: +- If a token appeared in process command lines (`OPENCLAW_GATEWAY_TOKEN=...`), rotate it immediately: +```bash +sudo -iu openclaw /home/openclaw/.local/bin/openclaw --profile dev-main doctor --generate-gateway-token +uid=$(id -u openclaw) +export XDG_RUNTIME_DIR=/run/user/$uid +export DBUS_SESSION_BUS_ADDRESS=unix:path=$XDG_RUNTIME_DIR/bus +sudo -u openclaw XDG_RUNTIME_DIR=$XDG_RUNTIME_DIR DBUS_SESSION_BUS_ADDRESS=$DBUS_SESSION_BUS_ADDRESS \ + systemctl --user restart openclaw-gateway-dev-main.service +``` diff --git a/inventories/dev/group_vars/all.yml b/inventories/dev/group_vars/all.yml index 4339193..0c6ceae 100644 --- a/inventories/dev/group_vars/all.yml +++ b/inventories/dev/group_vars/all.yml @@ -37,9 +37,13 @@ openclaw_enterprise_profiles: profile: full allow: - browser + - id: coolify-ops + workspace: /home/openclaw/.openclaw-dev-main/workspace-coolify-ops + tools: + profile: coding bindings: [] env: - OPENCLAW_GATEWAY_TOKEN: "{{ vault_openclaw_gateway_token_dev_main | default('tmp-dev-main-cfb2f8fdbd9144a88f1ec5a266') }}" + OPENCLAW_GATEWAY_TOKEN: "{{ vault_openclaw_gateway_token_dev_main | default('replace-me-dev-main-gateway-token') }}" OPENAI_API_KEY: "{{ vault_openai_api_key_dev | default('') }}" ANTHROPIC_API_KEY: "{{ vault_anthropic_api_key_dev | default('') }}" @@ -61,7 +65,7 @@ openclaw_enterprise_profiles: workspace: /home/openclaw/.openclaw-andrea/workspace bindings: [] env: - OPENCLAW_GATEWAY_TOKEN: "{{ vault_openclaw_gateway_token_andrea | default('tmp-andrea-7c53f22ab2e24c55bb9a2746ac') }}" + OPENCLAW_GATEWAY_TOKEN: "{{ vault_openclaw_gateway_token_andrea | default('replace-me-andrea-gateway-token') }}" OPENAI_API_KEY: "{{ vault_openai_api_key_dev | default('') }}" ANTHROPIC_API_KEY: "{{ vault_anthropic_api_key_dev | default('') 
}}" @@ -80,6 +84,9 @@ openclaw_control_plane_profiles: uptime_kuma_port: 31081 telegram_bot_token: "{{ vault_telegram_bot_token_efra_core | default('') }}" telegram_default_chat_id: "{{ vault_telegram_default_chat_id_efra_core | default('') }}" + worker_exec_mode: openclaw + openclaw_env_file: /etc/openclaw/secrets/dev-main.env + openclaw_gateway_token: "{{ vault_openclaw_gateway_token_dev_main | default('replace-me-dev-main-gateway-token') }}" postgres_password: "{{ vault_openclaw_cp_postgres_password_efra_core | default('efra-core-postgres-local') }}" nats_user: queue nats_password: "{{ vault_openclaw_cp_nats_password_efra_core | default('efra-core-nats-local') }}" diff --git a/roles/openclaw/tasks/docker-redhat.yml b/roles/openclaw/tasks/docker-redhat.yml new file mode 100644 index 0000000..ca11dda --- /dev/null +++ b/roles/openclaw/tasks/docker-redhat.yml @@ -0,0 +1,50 @@ +--- +# RedHat/Fedora-specific Docker installation (dnf-based) + +- name: Install required system packages for Docker (RedHat/Fedora) + ansible.builtin.dnf: + name: + - dnf-plugins-core + - ca-certificates + - curl + state: present + update_cache: true + +- name: Add Docker repository (RedHat/Fedora) + ansible.builtin.get_url: + url: "https://download.docker.com/linux/fedora/docker-ce.repo" + dest: /etc/yum.repos.d/docker-ce.repo + owner: root + group: root + mode: '0644' + +- name: Install Docker CE (RedHat/Fedora) + ansible.builtin.dnf: + name: + - docker-ce + - docker-ce-cli + - containerd.io + - docker-buildx-plugin + - docker-compose-plugin + state: present + update_cache: true + +- name: Ensure docker group exists + ansible.builtin.group: + name: docker + state: present + +- name: Ensure Docker service is started and enabled + ansible.builtin.systemd: + name: docker + state: started + enabled: true + +- name: Add user to docker group + ansible.builtin.user: + name: "{{ openclaw_user }}" + groups: docker + append: true + +- name: Reset SSH connection to apply docker group + 
ansible.builtin.meta: reset_connection diff --git a/roles/openclaw/tasks/firewall-redhat.yml b/roles/openclaw/tasks/firewall-redhat.yml new file mode 100644 index 0000000..64cd7e9 --- /dev/null +++ b/roles/openclaw/tasks/firewall-redhat.yml @@ -0,0 +1,64 @@ +--- +# RedHat/Fedora-specific firewall and security hardening (firewalld) + +- name: Install security packages (RedHat/Fedora) + ansible.builtin.dnf: + name: + - fail2ban + - firewalld + - dnf-automatic + state: present + update_cache: true + +- name: Configure fail2ban for SSH protection + ansible.builtin.copy: + dest: /etc/fail2ban/jail.local + owner: root + group: root + mode: '0644' + content: | + # OpenClaw security hardening - SSH protection + [DEFAULT] + bantime = 3600 + findtime = 600 + maxretry = 5 + backend = systemd + + [sshd] + enabled = true + port = ssh + filter = sshd + notify: Restart fail2ban + +- name: Enable and start fail2ban + ansible.builtin.systemd: + name: fail2ban + state: started + enabled: true + +- name: Enable and start firewalld + ansible.builtin.systemd: + name: firewalld + state: started + enabled: true + +- name: Allow SSH service in firewalld + ansible.posix.firewalld: + service: ssh + state: enabled + permanent: true + immediate: true + +- name: Allow Tailscale UDP port 41641 in firewalld + ansible.posix.firewalld: + port: 41641/udp + state: enabled + permanent: true + immediate: true + when: tailscale_enabled | bool + +- name: Enable automatic update timer (dnf-automatic) + ansible.builtin.systemd: + name: dnf-automatic.timer + state: started + enabled: true diff --git a/roles/openclaw/tasks/main.yml b/roles/openclaw/tasks/main.yml index 81a5dd9..bee5b32 100644 --- a/roles/openclaw/tasks/main.yml +++ b/roles/openclaw/tasks/main.yml @@ -1,21 +1,53 @@ --- +- name: Validate supported OS family for openclaw role + ansible.builtin.assert: + that: + - ansible_os_family in ['Debian', 'RedHat'] + fail_msg: >- + Unsupported OS family '{{ ansible_os_family }}' on {{ inventory_hostname 
}}. + Supported families: Debian, RedHat. + - name: Include system tools installation tasks ansible.builtin.include_tasks: system-tools.yml -- name: Include Tailscale installation tasks +- name: Include Tailscale installation tasks (Debian/Ubuntu) ansible.builtin.include_tasks: tailscale-linux.yml - when: tailscale_enabled | bool + when: + - tailscale_enabled | bool + - ansible_os_family == 'Debian' + +- name: Include Tailscale installation tasks (RedHat/Fedora) + ansible.builtin.include_tasks: tailscale-redhat.yml + when: + - tailscale_enabled | bool + - ansible_os_family == 'RedHat' - name: Include user creation tasks ansible.builtin.include_tasks: user.yml -- name: Include Docker installation tasks +- name: Include Docker installation tasks (Debian/Ubuntu) ansible.builtin.include_tasks: docker-linux.yml - when: not ci_test + when: + - not ci_test + - ansible_os_family == 'Debian' -- name: Include firewall configuration tasks +- name: Include Docker installation tasks (RedHat/Fedora) + ansible.builtin.include_tasks: docker-redhat.yml + when: + - not ci_test + - ansible_os_family == 'RedHat' + +- name: Include firewall configuration tasks (Debian/Ubuntu) ansible.builtin.include_tasks: firewall-linux.yml - when: not ci_test + when: + - not ci_test + - ansible_os_family == 'Debian' + +- name: Include firewall configuration tasks (RedHat/Fedora) + ansible.builtin.include_tasks: firewall-redhat.yml + when: + - not ci_test + - ansible_os_family == 'RedHat' - name: Include Node.js installation tasks ansible.builtin.include_tasks: nodejs.yml diff --git a/roles/openclaw/tasks/nodejs-debian.yml b/roles/openclaw/tasks/nodejs-debian.yml new file mode 100644 index 0000000..94a4a5b --- /dev/null +++ b/roles/openclaw/tasks/nodejs-debian.yml @@ -0,0 +1,71 @@ +--- +- name: Install required packages for Node.js + ansible.builtin.apt: + name: + - ca-certificates + - curl + - gnupg + state: present + +- name: Create directory for NodeSource GPG key + ansible.builtin.file: + path: 
/etc/apt/keyrings + state: directory + mode: '0755' + +- name: Add NodeSource GPG key + ansible.builtin.shell: + cmd: | + set -o pipefail + curl -fsSL https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key | \ + gpg --dearmor -o /etc/apt/keyrings/nodesource.gpg + chmod a+r /etc/apt/keyrings/nodesource.gpg + creates: /etc/apt/keyrings/nodesource.gpg + executable: /bin/bash + +- name: Remove legacy NodeSource deb822 source file + ansible.builtin.file: + path: /etc/apt/sources.list.d/nodesource.sources + state: absent + +- name: Add NodeSource repository + ansible.builtin.copy: + dest: /etc/apt/sources.list.d/nodesource.list + mode: '0644' + content: | + deb [signed-by=/etc/apt/keyrings/nodesource.gpg] https://deb.nodesource.com/node_{{ nodejs_version }} nodistro main + +- name: Update apt cache after adding NodeSource repo + ansible.builtin.apt: + update_cache: true + +- name: Install Node.js + ansible.builtin.apt: + name: nodejs + state: present + +- name: Check if pnpm is already installed + ansible.builtin.command: pnpm --version + register: pnpm_check + failed_when: false + changed_when: false + +- name: Install pnpm globally + ansible.builtin.command: npm install -g pnpm + when: pnpm_check.rc != 0 + +- name: Verify Node.js installation + ansible.builtin.command: node --version + register: node_version + changed_when: false + +- name: Verify pnpm installation + ansible.builtin.command: pnpm --version + register: pnpm_version + changed_when: false + +- name: Display Node.js and pnpm versions + ansible.builtin.debug: + msg: + - "Node.js version: {{ node_version.stdout }}" + - "pnpm version: {{ pnpm_version.stdout }}" diff --git a/roles/openclaw/tasks/nodejs-redhat.yml b/roles/openclaw/tasks/nodejs-redhat.yml new file mode 100644 index 0000000..7c6304a --- /dev/null +++ b/roles/openclaw/tasks/nodejs-redhat.yml @@ -0,0 +1,49 @@ +--- +- name: Install required packages for Node.js (RedHat/Fedora) + ansible.builtin.dnf: + name: + - ca-certificates + - curl + - gnupg2 
+ state: present + update_cache: true + +- name: Add NodeSource repository (RedHat/Fedora) + ansible.builtin.shell: + cmd: | + set -o pipefail + curl -fsSL https://rpm.nodesource.com/setup_{{ nodejs_version }} | bash - + creates: /etc/yum.repos.d/nodesource-nodejs.repo + executable: /bin/bash + +- name: Install Node.js (RedHat/Fedora) + ansible.builtin.dnf: + name: nodejs + state: present + update_cache: true + +- name: Check if pnpm is already installed + ansible.builtin.command: pnpm --version + register: pnpm_check + failed_when: false + changed_when: false + +- name: Install pnpm globally + ansible.builtin.command: npm install -g pnpm + when: pnpm_check.rc != 0 + +- name: Verify Node.js installation + ansible.builtin.command: node --version + register: node_version + changed_when: false + +- name: Verify pnpm installation + ansible.builtin.command: pnpm --version + register: pnpm_version + changed_when: false + +- name: Display Node.js and pnpm versions + ansible.builtin.debug: + msg: + - "Node.js version: {{ node_version.stdout }}" + - "pnpm version: {{ pnpm_version.stdout }}" diff --git a/roles/openclaw/tasks/nodejs.yml b/roles/openclaw/tasks/nodejs.yml index 8c1ecf4..1f58d1c 100644 --- a/roles/openclaw/tasks/nodejs.yml +++ b/roles/openclaw/tasks/nodejs.yml @@ -1,69 +1,8 @@ --- -- name: Install required packages for Node.js - ansible.builtin.apt: - name: - - ca-certificates - - curl - - gnupg - state: present +- name: Include Node.js tasks for Debian/Ubuntu + ansible.builtin.include_tasks: nodejs-debian.yml + when: ansible_os_family == 'Debian' -- name: Create directory for NodeSource GPG key - ansible.builtin.file: - path: /etc/apt/keyrings - state: directory - mode: '0755' - -- name: Add NodeSource GPG key - ansible.builtin.shell: - cmd: | - set -o pipefail - curl -fsSL https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key | \ - gpg --dearmor -o /etc/apt/keyrings/nodesource.gpg - chmod a+r /etc/apt/keyrings/nodesource.gpg - creates: 
/etc/apt/keyrings/nodesource.gpg - executable: /bin/bash - -- name: Add NodeSource repository - ansible.builtin.shell: - cmd: | - set -o pipefail - echo "deb [signed-by=/etc/apt/keyrings/nodesource.gpg] \ - https://deb.nodesource.com/node_{{ nodejs_version }} nodistro main" | \ - tee /etc/apt/sources.list.d/nodesource.list > /dev/null - creates: /etc/apt/sources.list.d/nodesource.list - executable: /bin/bash - -- name: Update apt cache after adding NodeSource repo - ansible.builtin.apt: - update_cache: true - -- name: Install Node.js - ansible.builtin.apt: - name: nodejs - state: present - -- name: Check if pnpm is already installed - ansible.builtin.command: pnpm --version - register: pnpm_check - failed_when: false - changed_when: false - -- name: Install pnpm globally - ansible.builtin.command: npm install -g pnpm - when: pnpm_check.rc != 0 - -- name: Verify Node.js installation - ansible.builtin.command: node --version - register: node_version - changed_when: false - -- name: Verify pnpm installation - ansible.builtin.command: pnpm --version - register: pnpm_version - changed_when: false - -- name: Display Node.js and pnpm versions - ansible.builtin.debug: - msg: - - "Node.js version: {{ node_version.stdout }}" - - "pnpm version: {{ pnpm_version.stdout }}" +- name: Include Node.js tasks for RedHat/Fedora + ansible.builtin.include_tasks: nodejs-redhat.yml + when: ansible_os_family == 'RedHat' diff --git a/roles/openclaw/tasks/system-tools-linux.yml b/roles/openclaw/tasks/system-tools-linux.yml index 712952f..74f5838 100644 --- a/roles/openclaw/tasks/system-tools-linux.yml +++ b/roles/openclaw/tasks/system-tools-linux.yml @@ -1,6 +1,11 @@ --- # Linux-specific system tools installation (apt-based) +- name: Remove legacy NodeSource deb822 source file before apt operations + ansible.builtin.file: + path: /etc/apt/sources.list.d/nodesource.sources + state: absent + - name: Install essential system tools (Linux - apt) ansible.builtin.apt: name: diff --git 
a/roles/openclaw/tasks/system-tools-redhat.yml b/roles/openclaw/tasks/system-tools-redhat.yml new file mode 100644 index 0000000..3dddca1 --- /dev/null +++ b/roles/openclaw/tasks/system-tools-redhat.yml @@ -0,0 +1,55 @@ +--- +# RedHat/Fedora-specific system tools installation (dnf-based) + +- name: Install essential system tools (RedHat/Fedora - dnf) + ansible.builtin.dnf: + name: + # Editors + - vim-enhanced + - nano + # Version control + - git + - git-lfs + # Network tools + - curl + - wget + - nmap-ncat + - net-tools + - bind-utils + - iputils + - traceroute + - tcpdump + - nmap + - socat + - telnet + # Debugging tools + - strace + - lsof + - gdb + - htop + - iotop + - iftop + - sysstat + - procps-ng + # System utilities + - tmux + - tree + - jq + - unzip + - rsync + - less + # Build essentials for development + - gcc + - gcc-c++ + - make + - file + state: present + update_cache: true + +- name: Deploy global vim configuration (RedHat/Fedora) + ansible.builtin.template: + src: vimrc.j2 + dest: /etc/vimrc.local + owner: root + group: root + mode: '0644' diff --git a/roles/openclaw/tasks/system-tools.yml b/roles/openclaw/tasks/system-tools.yml index d4e0695..13f1e9a 100644 --- a/roles/openclaw/tasks/system-tools.yml +++ b/roles/openclaw/tasks/system-tools.yml @@ -1,8 +1,13 @@ --- -# Main system tools orchestration - Linux only +# Main system tools orchestration by OS family -- name: Include Linux system tools installation +- name: Include Debian/Ubuntu system tools installation ansible.builtin.include_tasks: system-tools-linux.yml + when: ansible_os_family == 'Debian' + +- name: Include RedHat/Fedora system tools installation + ansible.builtin.include_tasks: system-tools-redhat.yml + when: ansible_os_family == 'RedHat' # Common tasks for all operating systems diff --git a/roles/openclaw/tasks/tailscale-redhat.yml b/roles/openclaw/tasks/tailscale-redhat.yml new file mode 100644 index 0000000..c42d305 --- /dev/null +++ b/roles/openclaw/tasks/tailscale-redhat.yml @@ 
-0,0 +1,44 @@ +--- +# RedHat/Fedora-specific Tailscale installation + +- name: Add Tailscale repository (RedHat/Fedora) + ansible.builtin.get_url: + url: https://pkgs.tailscale.com/stable/fedora/tailscale.repo + dest: /etc/yum.repos.d/tailscale.repo + owner: root + group: root + mode: '0644' + +- name: Install Tailscale + ansible.builtin.dnf: + name: tailscale + state: present + update_cache: true + +- name: Enable Tailscale service (RedHat/Fedora) + ansible.builtin.systemd: + name: tailscaled + enabled: true + state: started + +- name: Check if Tailscale is already connected (RedHat/Fedora) + ansible.builtin.command: tailscale status --json + register: tailscale_status_redhat + changed_when: false + failed_when: false + +- name: Display Tailscale auth URL if not connected (RedHat/Fedora) + ansible.builtin.debug: + msg: + - "============================================" + - "Tailscale installed but not connected yet" + - "============================================" + - "" + - "To connect this machine to your Tailnet:" + - "Run: sudo tailscale up" + - "" + - "For unattended installation, use an auth key:" + - "sudo tailscale up --authkey tskey-auth-xxxxx" + - "" + - "Get auth key from: https://login.tailscale.com/admin/settings/keys" + when: tailscale_status_redhat.rc != 0 diff --git a/roles/openclaw/tasks/user.yml b/roles/openclaw/tasks/user.yml index ef669e4..c2f618d 100644 --- a/roles/openclaw/tasks/user.yml +++ b/roles/openclaw/tasks/user.yml @@ -112,17 +112,17 @@ ansible.builtin.command: "id -u {{ openclaw_user }}" register: openclaw_uid changed_when: false - when: ansible_os_family == 'Debian' and not ci_test + when: ansible_system == 'Linux' and not ci_test - name: Display openclaw user ID ansible.builtin.debug: msg: "OpenClaw user ID: {{ openclaw_uid.stdout }}" - when: ansible_os_family == 'Debian' and not ci_test + when: ansible_system == 'Linux' and not ci_test - name: Enable lingering for openclaw user (allows systemd user services without login) 
ansible.builtin.command: "loginctl enable-linger {{ openclaw_user }}" changed_when: false - when: ansible_os_family == 'Debian' and not ci_test + when: ansible_system == 'Linux' and not ci_test - name: Create runtime directory for openclaw user ansible.builtin.file: @@ -131,12 +131,12 @@ owner: "{{ openclaw_user }}" group: "{{ openclaw_user }}" mode: '0700' - when: ansible_os_family == 'Debian' and not ci_test + when: ansible_system == 'Linux' and not ci_test - name: Store openclaw UID as fact for later use ansible.builtin.set_fact: openclaw_uid_value: "{{ openclaw_uid.stdout }}" - when: ansible_os_family == 'Debian' and not ci_test + when: ansible_system == 'Linux' and not ci_test # SSH key configuration - name: Create .ssh directory for openclaw user @@ -174,7 +174,7 @@ owner: "{{ openclaw_user }}" group: "{{ openclaw_user }}" mode: '0644' - when: ansible_os_family == 'Debian' and not ci_test + when: ansible_system == 'Linux' and not ci_test - name: Set DBUS_SESSION_BUS_ADDRESS in .bashrc for openclaw user ansible.builtin.blockinfile: @@ -189,4 +189,4 @@ owner: "{{ openclaw_user }}" group: "{{ openclaw_user }}" mode: '0644' - when: ansible_os_family == 'Debian' and not ci_test + when: ansible_system == 'Linux' and not ci_test diff --git a/roles/openclaw_control_plane/templates/control-plane.env.j2 b/roles/openclaw_control_plane/templates/control-plane.env.j2 index 2162764..abe55ea 100644 --- a/roles/openclaw_control_plane/templates/control-plane.env.j2 +++ b/roles/openclaw_control_plane/templates/control-plane.env.j2 @@ -10,4 +10,11 @@ POSTGRES_URL=postgres://{{ profile.postgres_user | default('openclaw') }}:{{ pro TELEGRAM_BOT_TOKEN={{ profile.telegram_bot_token | default('') }} TELEGRAM_DEFAULT_CHAT_ID={{ profile.telegram_default_chat_id | default('') }} ROUTER_FORCED_AGENT={{ profile.router_forced_agent | default('') }} +WORKER_EXEC_MODE={{ profile.worker_exec_mode | default('stub') }} +OPENCLAW_BIN={{ profile.openclaw_bin | 
default('/home/openclaw/.local/bin/openclaw') }} +OPENCLAW_HOME={{ profile.openclaw_home | default('/home/openclaw') }} +OPENCLAW_ENV_FILE={{ profile.openclaw_env_file | default('/etc/openclaw/secrets/' ~ (profile.gateway_profile | default(profile.name)) ~ '.env') }} +OPENCLAW_GATEWAY_TOKEN={{ profile.openclaw_gateway_token | default('') }} +OPENCLAW_TIMEOUT_MS={{ profile.openclaw_timeout_ms | default(120000) }} +OPENCLAW_BUNDLED_PLUGINS_DIR={{ profile.openclaw_bundled_plugins_dir | default('/home/openclaw/.openclaw/bundled-extensions') }} GRAFANA_ADMIN_PASSWORD={{ profile.grafana_admin_password | default('openclaw') }} diff --git a/roles/openclaw_control_plane/templates/docker-compose.lite.yml.j2 b/roles/openclaw_control_plane/templates/docker-compose.lite.yml.j2 index 9589a15..8c6783b 100644 --- a/roles/openclaw_control_plane/templates/docker-compose.lite.yml.j2 +++ b/roles/openclaw_control_plane/templates/docker-compose.lite.yml.j2 @@ -58,6 +58,17 @@ services: environment: METRICS_PORT: 9411 WORKER_AGENT_ID: main + WORKER_EXEC_MODE: ${WORKER_EXEC_MODE} + OPENCLAW_BIN: ${OPENCLAW_BIN} + OPENCLAW_HOME: ${OPENCLAW_HOME} + OPENCLAW_ENV_FILE: ${OPENCLAW_ENV_FILE} + OPENCLAW_TIMEOUT_MS: ${OPENCLAW_TIMEOUT_MS} + OPENCLAW_BUNDLED_PLUGINS_DIR: ${OPENCLAW_BUNDLED_PLUGINS_DIR} +{% if profile.worker_exec_mode | default('stub') == 'openclaw' %} + user: "994:994" + volumes: + - /home/openclaw:/home/openclaw +{% endif %} restart: unless-stopped depends_on: - nats From 6fcf8c15059e610c1604977cf06fa03a41e5eeb7 Mon Sep 17 00:00:00 2001 From: Efrain Garay Date: Sun, 1 Mar 2026 19:55:42 -0300 Subject: [PATCH 11/16] feat(ops): add codex auth-sync workflow and harden install validation --- .gitignore | 2 + Makefile | 18 +- README.md | 6 +- docs/operations-workflow.md | 49 +++- docs/operator-runbook.md | 67 +++-- ops/auth-sync.sh | 235 ++++++++++++++++++ ops/install.sh | 3 + ops/oauth-login.sh | 14 +- ops/secrets-refactor.sh | 78 ++++++ ops/smoke.sh | 23 +- ops/validate-secrets.sh | 
51 ++++ .../openclaw_control_plane/tasks/profile.yml | 35 +++ 12 files changed, 518 insertions(+), 63 deletions(-) create mode 100755 ops/auth-sync.sh create mode 100755 ops/secrets-refactor.sh create mode 100755 ops/validate-secrets.sh diff --git a/.gitignore b/.gitignore index 29d7b1e..a723b2b 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,7 @@ *.log .ansible/ .vault_pass +backups/ # Secrets and credentials *.env @@ -22,3 +23,4 @@ group_vars/ !inventories/**/host_vars/ !inventories/**/host_vars/.gitkeep inventories/**/group_vars/vault.yml +inventories/**/group_vars/vault.manual.refactor.yml diff --git a/Makefile b/Makefile index 0a407ca..2dddd66 100644 --- a/Makefile +++ b/Makefile @@ -7,8 +7,9 @@ INVENTORY ?= inventories/$(ENV)/hosts.yml LIMIT ?= zennook PROFILES ?= dev-main andrea OAUTH_PROVIDER ?= openai-codex +MODEL_REF ?= openai-codex/gpt-5.3-codex -.PHONY: help backup purge install cloudflare oauth-login smoke reinstall +.PHONY: help backup purge install cloudflare auth-sync oauth-login smoke reinstall secrets-refactor help: @echo "OpenClaw Ops Targets" @@ -16,14 +17,16 @@ help: @echo " make backup Backup current OpenClaw + control-plane state" @echo " make purge CONFIRM=1 Purge deployed state and containers" @echo " make install Install/reconcile enterprise + control-plane" + @echo " make secrets-refactor Build manual secrets migration file + validate vault" @echo " make cloudflare Reconcile Cloudflare tunnel/service only" - @echo " make oauth-login Run interactive OAuth login per profile" + @echo " make auth-sync Sync Codex creds from /home/efra/.codex to OpenClaw profiles" + @echo " make oauth-login Alias to make auth-sync (legacy name)" @echo " make smoke Run post-install smoke checks" @echo " make reinstall CONFIRM=1 backup + purge + install + smoke" @echo "" @echo "Variables:" @echo " ENV=$(ENV) INVENTORY=$(INVENTORY) LIMIT=$(LIMIT)" - @echo " PROFILES='$(PROFILES)' OAUTH_PROVIDER=$(OAUTH_PROVIDER)" + @echo " PROFILES='$(PROFILES)' 
OAUTH_PROVIDER=$(OAUTH_PROVIDER) MODEL_REF=$(MODEL_REF)" backup: @ENV="$(ENV)" INVENTORY="$(INVENTORY)" LIMIT="$(LIMIT)" ./ops/backup.sh @@ -35,11 +38,16 @@ purge: install: @ENV="$(ENV)" INVENTORY="$(INVENTORY)" LIMIT="$(LIMIT)" ./ops/install.sh +secrets-refactor: + @ENV="$(ENV)" INVENTORY="$(INVENTORY)" LIMIT="$(LIMIT)" ./ops/secrets-refactor.sh + cloudflare: @ENV="$(ENV)" INVENTORY="$(INVENTORY)" LIMIT="$(LIMIT)" ./ops/cloudflare-reconcile.sh -oauth-login: - @ENV="$(ENV)" INVENTORY="$(INVENTORY)" LIMIT="$(LIMIT)" PROFILES="$(PROFILES)" OAUTH_PROVIDER="$(OAUTH_PROVIDER)" ./ops/oauth-login.sh +auth-sync: + @ENV="$(ENV)" INVENTORY="$(INVENTORY)" LIMIT="$(LIMIT)" PROFILES="$(PROFILES)" OAUTH_PROVIDER="$(OAUTH_PROVIDER)" MODEL_REF="$(MODEL_REF)" ./ops/auth-sync.sh + +oauth-login: auth-sync smoke: @ENV="$(ENV)" INVENTORY="$(INVENTORY)" LIMIT="$(LIMIT)" ./ops/smoke.sh diff --git a/README.md b/README.md index f3433d8..2584b43 100644 --- a/README.md +++ b/README.md @@ -148,7 +148,7 @@ ansible-playbook playbook.yml --ask-become-pass ## Documentation -- [Operator Runbook](docs/operator-runbook.md) - End-to-end profile/agent/OAuth/queue operations guide +- [Operator Runbook](docs/operator-runbook.md) - End-to-end profile/agent/auth-sync/queue operations guide - [Configuration Guide](docs/configuration.md) - All configuration options - [Development Mode](docs/development-mode.md) - Build from source - [Security Architecture](docs/security.md) - Security details @@ -179,7 +179,9 @@ make install # Reconcile only Cloudflare tunnel/service (if enabled in inventory) make cloudflare -# Interactive OAuth for Codex provider +# Non-interactive Codex credential sync +make auth-sync PROFILES="dev-main andrea" OAUTH_PROVIDER=openai-codex +# legacy alias (same behavior) make oauth-login PROFILES="dev-main andrea" OAUTH_PROVIDER=openai-codex # Validate full flow diff --git a/docs/operations-workflow.md b/docs/operations-workflow.md index 8655305..6a96cc0 100644 --- 
a/docs/operations-workflow.md +++ b/docs/operations-workflow.md @@ -10,8 +10,9 @@ This repository provides a Makefile interface over `ops/*.sh` scripts: - `make backup` - `make purge CONFIRM=1` - `make install` +- `make secrets-refactor` - `make cloudflare` -- `make oauth-login` +- `make auth-sync` - `make smoke` - `make reinstall CONFIRM=1` @@ -20,17 +21,57 @@ This repository provides a Makefile interface over `ops/*.sh` scripts: - `Makefile`: stable operator commands. - `ops/*.sh`: implementation details, safe to extend. -## OAuth note (Codex) +## Auth sync (Codex) -`openai-codex` login is interactive by design (browser OAuth callback). -It cannot be made fully non-interactive without changing provider auth semantics. +Credential sync is now non-interactive and uses Codex auth files from the `efra` user. Use: ```bash +make auth-sync PROFILES="dev-main andrea" OAUTH_PROVIDER=openai-codex +# legacy alias (same behavior) make oauth-login PROFILES="dev-main andrea" OAUTH_PROVIDER=openai-codex ``` +Optional environment overrides (loaded from `/home/efra/.env` when present): + +- `EFRA_CODEX_HOME` (default: `/home/efra/.codex`) +- `EFRA_CODEX_AUTH_DEFAULT` (default: `/home/efra/.codex/auth.json`) +- `EFRA_CODEX_AUTH_ANDREA` (default: `/home/efra/.codex/auth-andrea.json`) + +The sync process: + +- copies auth files to `/home/openclaw/.codex/` +- writes `openai-codex` OAuth profiles into each target profile's `auth-profiles.json` +- sets profile default model to `openai-codex/gpt-5.3-codex` (configurable with `MODEL_REF`) + +Runtime command environment still auto-loads `/etc/openclaw/secrets/<profile>.env` and exports: + +- `HOME=/home/openclaw` +- `OPENCLAW_BUNDLED_PLUGINS_DIR=/home/openclaw/.openclaw/bundled-extensions` + +## Manual secrets refactor + +Before a clean install, generate and review a manual migration file: + +```bash +make secrets-refactor ENV=dev LIMIT=zennook +``` + +This creates: + +- `inventories/dev/group_vars/vault.manual.refactor.yml` + +Then: + +```bash +#
review and merge into vault.yml +./ops/validate-secrets.sh +``` + +`make install` now runs `./ops/validate-secrets.sh` first and aborts if required keys are +missing or still using placeholder values. + Cloudflare reconcile (subdomain exposure only): ```bash diff --git a/docs/operator-runbook.md b/docs/operator-runbook.md index d2912e9..449d789 100644 --- a/docs/operator-runbook.md +++ b/docs/operator-runbook.md @@ -1,5 +1,5 @@ --- -title: Operator Runbook (Profiles, Agents, OAuth, Queues) +title: Operator Runbook (Profiles, Agents, Auth Sync, Queues) summary: End-to-end step-by-step guide to deploy, operate, extend, and troubleshoot multi-profile OpenClaw with Stage 2 control-plane. --- @@ -10,7 +10,7 @@ This runbook is the canonical step-by-step guide to: - Install and reconcile OpenClaw with Ansible. - Operate multi-profile gateways (`dev-main`, `andrea`, and new profiles). - Operate Stage 2 control-plane (`full` and `lite`). -- Login providers (OpenAI Codex OAuth) per profile. +- Sync OpenAI Codex credentials per profile. - Create new profiles and new agents safely. - Route Telegram traffic to agents and validate queue execution. @@ -120,7 +120,9 @@ make install ENV=dev LIMIT=zennook # 4) Optional Cloudflare reconcile make cloudflare ENV=dev LIMIT=zennook -# 5) OAuth login (interactive browser flow) +# 5) Credential sync (non-interactive) +make auth-sync ENV=dev LIMIT=zennook PROFILES="dev-main andrea" OAUTH_PROVIDER=openai-codex +# legacy alias make oauth-login ENV=dev LIMIT=zennook PROFILES="dev-main andrea" OAUTH_PROVIDER=openai-codex # 6) Run smoke tests @@ -133,42 +135,35 @@ Equivalent direct Ansible command: ansible-playbook -i inventories/dev/hosts.yml playbooks/enterprise.yml --ask-become-pass --limit zennook ``` -## 4. Provider Login (OpenAI Codex OAuth) +## 4. Provider Auth Sync (OpenAI Codex) -### 4.1 Why OAuth per profile +### 4.1 Why sync per profile -Auth state is profile-specific (`--profile `). 
-If you run multiple profiles, log in once for each profile. +Auth state is profile-specific (`--profile <name>`), and each profile has its own +`auth-profiles.json` under its `agents/*/agent` directories. -### 4.2 Standard login command +### 4.2 Standard sync command ```bash -sudo -u openclaw -H /home/openclaw/.local/bin/openclaw --profile dev-main models auth login --provider openai-codex -sudo -u openclaw -H /home/openclaw/.local/bin/openclaw --profile andrea models auth login --provider openai-codex +make auth-sync PROFILES="dev-main andrea" OAUTH_PROVIDER=openai-codex ``` -### 4.3 OAuth flow details +`make oauth-login` remains available as a compatibility alias and runs the same sync workflow. -```mermaid -sequenceDiagram - participant CLI as openclaw CLI - participant B as Browser - participant O as OpenAI OAuth - participant L as localhost:1455 callback - participant P as OpenClaw profile state - - CLI->>B: Open authorize URL - B->>O: User signs in and approves - O->>L: Redirect with auth code - CLI->>P: Store OAuth credentials for profile - CLI-->>CLI: provider ready in this profile -``` +### 4.3 Credential sources + +By default: + +- `EFRA_CODEX_AUTH_DEFAULT=/home/efra/.codex/auth.json` +- `EFRA_CODEX_AUTH_ANDREA=/home/efra/.codex/auth-andrea.json` + +If `/home/efra/.env` exists, `auth-sync` loads it first so these paths can be overridden. -### 4.4 Verify login +### 4.4 Verify auth/model state ```bash -sudo -u openclaw -H /home/openclaw/.local/bin/openclaw --profile dev-main status --all -sudo -u openclaw -H /home/openclaw/.local/bin/openclaw --profile dev-main models list +sudo -u openclaw -H /home/openclaw/.local/bin/openclaw --profile dev-main models status --probe +sudo -u openclaw -H /home/openclaw/.local/bin/openclaw --profile andrea models status --probe ``` ## 5.
Create a New Profile (Gateway + Optional Control-Plane) @@ -253,7 +248,7 @@ sudo -u openclaw -H /home/openclaw/.local/bin/openclaw --profile ops-lab status ### 5.5 Login provider for new profile ```bash -sudo -u openclaw -H /home/openclaw/.local/bin/openclaw --profile ops-lab models auth login --provider openai-codex +make auth-sync PROFILES="ops-lab" OAUTH_PROVIDER=openai-codex ``` ## 6. Create a New Agent Inside an Existing Profile @@ -430,14 +425,16 @@ Correct: sudo -u openclaw -H /home/openclaw/.local/bin/openclaw --profile dev-main doctor --fix ``` -### 10.3 OAuth command says no provider plugins found +### 10.3 Auth sync cannot read Codex credentials Action: -1. Ensure correct OpenClaw install/profile. -2. Ensure bundled plugins path exists: - - `/home/openclaw/.openclaw/bundled-extensions` -3. Prefer onboarding wizard for first-time provider setup if plugin state is inconsistent. +1. Ensure `/home/efra/.codex/auth.json` exists. +2. If using per-profile creds, ensure `/home/efra/.codex/auth-andrea.json` exists. +3. If custom paths are needed, export them in `/home/efra/.env`: + - `EFRA_CODEX_AUTH_DEFAULT=...` + - `EFRA_CODEX_AUTH_ANDREA=...` +4. Re-run `make auth-sync`. ### 10.4 Browser task hangs or no Telegram response @@ -464,7 +461,7 @@ When adding profiles/agents: 1. Update inventory + Vault. 2. `make install`. -3. Run OAuth login for target profiles. +3. Run auth sync for target profiles. 4. Run smoke checks. 5. Validate Telegram E2E. 6. Commit changes with docs + inventory + role/template updates together. 
diff --git a/ops/auth-sync.sh b/ops/auth-sync.sh new file mode 100755 index 0000000..67f2822 --- /dev/null +++ b/ops/auth-sync.sh @@ -0,0 +1,235 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck source=ops/common.sh +source "${SCRIPT_DIR}/common.sh" + +provider="${OAUTH_PROVIDER:-openai-codex}" +profiles_raw="${PROFILES:-dev-main andrea}" +model_ref="${MODEL_REF:-openai-codex/gpt-5.3-codex}" + +efra_env_file="${EFRA_ENV_FILE:-/home/efra/.env}" +if [[ -f "${efra_env_file}" ]]; then + # shellcheck disable=SC1090 + source "${efra_env_file}" +fi + +source_codex_home="${EFRA_CODEX_HOME:-/home/efra/.codex}" +default_auth="${EFRA_CODEX_AUTH_DEFAULT:-${source_codex_home}/auth.json}" +andrea_auth="${EFRA_CODEX_AUTH_ANDREA:-${source_codex_home}/auth-andrea.json}" + +[[ "${provider}" == "openai-codex" ]] || die "auth-sync supports only OAUTH_PROVIDER=openai-codex" +[[ -f "${default_auth}" ]] || die "Missing default Codex auth file: ${default_auth}" + +if [[ ! -f "${andrea_auth}" ]]; then + log "Andrea auth file not found (${andrea_auth}); falling back to default credential." 
+ andrea_auth="${default_auth}" +fi + +log "Syncing Codex credentials from ${source_codex_home} into OpenClaw profiles: ${profiles_raw}" + +run_sudo env \ + PROFILES_RAW="${profiles_raw}" \ + DEFAULT_AUTH="${default_auth}" \ + ANDREA_AUTH="${andrea_auth}" \ + node - <<'NODE' +const fs = require("fs"); +const path = require("path"); +const cp = require("child_process"); + +function decodeJwtPayload(token) { + try { + const parts = String(token || "").split("."); + if (parts.length < 2) { + return null; + } + const b64 = parts[1].replace(/-/g, "+").replace(/_/g, "/"); + const pad = "=".repeat((4 - (b64.length % 4)) % 4); + return JSON.parse(Buffer.from(b64 + pad, "base64").toString("utf8")); + } catch { + return null; + } +} + +function readCredential(filePath) { + const raw = JSON.parse(fs.readFileSync(filePath, "utf8")); + const tokens = raw && raw.tokens ? raw.tokens : {}; + const access = tokens.access_token; + const refresh = tokens.refresh_token; + const accountId = tokens.account_id; + if (typeof access !== "string" || !access || typeof refresh !== "string" || !refresh) { + throw new Error(`Missing access/refresh token in ${filePath}`); + } + + const accessPayload = decodeJwtPayload(access) || {}; + const idPayload = decodeJwtPayload(tokens.id_token) || {}; + + const expSec = Number(accessPayload.exp); + const expires = Number.isFinite(expSec) && expSec > 0 ? expSec * 1000 : Date.now() + 60 * 60 * 1000; + const email = + typeof idPayload.email === "string" && idPayload.email.trim() ? 
idPayload.email.trim() : "default"; + + const credential = { + type: "oauth", + provider: "openai-codex", + access, + refresh, + expires, + }; + if (typeof accountId === "string" && accountId) { + credential.accountId = accountId; + } + if (email !== "default") { + credential.email = email; + } + + return { credential, email }; +} + +function ensureDir(dirPath, uid, gid) { + fs.mkdirSync(dirPath, { recursive: true, mode: 0o700 }); + fs.chownSync(dirPath, uid, gid); +} + +function loadStore(storePath) { + try { + const parsed = JSON.parse(fs.readFileSync(storePath, "utf8")); + if (!parsed || typeof parsed !== "object") { + return { version: 1, profiles: {} }; + } + if (!parsed.profiles || typeof parsed.profiles !== "object") { + parsed.profiles = {}; + } + if (typeof parsed.version !== "number") { + parsed.version = 1; + } + return parsed; + } catch { + return { version: 1, profiles: {} }; + } +} + +function writeStore(storePath, store, uid, gid) { + ensureDir(path.dirname(storePath), uid, gid); + fs.writeFileSync(storePath, `${JSON.stringify(store, null, 2)}\n`, { mode: 0o600 }); + fs.chownSync(storePath, uid, gid); + fs.chmodSync(storePath, 0o600); +} + +function resolveProfileDir(profileName) { + if (profileName === "default" || profileName === "main") { + return "/home/openclaw/.openclaw"; + } + return `/home/openclaw/.openclaw-${profileName}`; +} + +function collectAgentDirs(profileDir) { + const dirs = new Set([path.join(profileDir, "agents", "main", "agent")]); + const configPath = path.join(profileDir, "openclaw.json"); + + try { + const cfg = JSON.parse(fs.readFileSync(configPath, "utf8")); + const list = cfg && cfg.agents && Array.isArray(cfg.agents.list) ? cfg.agents.list : []; + for (const item of list) { + if (!item || typeof item !== "object") { + continue; + } + if (typeof item.agentDir === "string" && item.agentDir.trim()) { + dirs.add(item.agentDir.trim()); + continue; + } + const id = typeof item.id === "string" && item.id.trim() ? 
item.id.trim() : "main"; + dirs.add(path.join(profileDir, "agents", id, "agent")); + } + } catch { + // Keep default main agent dir. + } + + return Array.from(dirs); +} + +const profilesRaw = process.env.PROFILES_RAW || "dev-main andrea"; +const profiles = profilesRaw + .split(/\s+/) + .map((v) => v.trim()) + .filter(Boolean); + +if (profiles.length === 0) { + throw new Error("PROFILES_RAW resolved to an empty profile list."); +} + +const defaultAuth = process.env.DEFAULT_AUTH; +const andreaAuth = process.env.ANDREA_AUTH || defaultAuth; +if (!defaultAuth) { + throw new Error("DEFAULT_AUTH is required."); +} + +const defaultCred = readCredential(defaultAuth); +const andreaCred = readCredential(andreaAuth); + +const uid = Number(cp.execSync("id -u openclaw", { encoding: "utf8" }).trim()); +const gid = Number(cp.execSync("id -g openclaw", { encoding: "utf8" }).trim()); + +const codexDir = "/home/openclaw/.codex"; +ensureDir(codexDir, uid, gid); +for (const [src, name] of [ + [defaultAuth, "auth.json"], + [andreaAuth, "auth-andrea.json"], +]) { + fs.copyFileSync(src, path.join(codexDir, name)); + fs.chownSync(path.join(codexDir, name), uid, gid); + fs.chmodSync(path.join(codexDir, name), 0o600); +} + +let stores = 0; +for (const profile of profiles) { + const profileDir = resolveProfileDir(profile); + const selected = profile === "andrea" ? 
andreaCred : defaultCred; + const agentDirs = collectAgentDirs(profileDir); + + for (const agentDir of agentDirs) { + ensureDir(agentDir, uid, gid); + const storePath = path.join(agentDir, "auth-profiles.json"); + const store = loadStore(storePath); + store.version = 1; + if (!store.profiles || typeof store.profiles !== "object") { + store.profiles = {}; + } + + store.profiles["openai-codex:default"] = selected.credential; + if (!store.order || typeof store.order !== "object") { + store.order = {}; + } + + if (selected.email && selected.email !== "default") { + const emailProfile = `openai-codex:${selected.email}`; + store.profiles[emailProfile] = selected.credential; + store.order["openai-codex"] = [emailProfile, "openai-codex:default"]; + } else { + store.order["openai-codex"] = ["openai-codex:default"]; + } + + writeStore(storePath, store, uid, gid); + stores += 1; + } + + console.log(`Synced auth-profiles for profile=${profile} agentDirs=${agentDirs.length}`); +} + +console.log(`SYNC_OK profiles=${profiles.length} stores=${stores}`); +NODE + +for profile in ${profiles_raw}; do + log "Configuring default model for profile=${profile} -> ${model_ref}" + profile_env="/etc/openclaw/secrets/${profile}.env" + run_sudo -u openclaw -H bash -lc \ + "set -euo pipefail; \ + export HOME=/home/openclaw; \ + export OPENCLAW_BUNDLED_PLUGINS_DIR=/home/openclaw/.openclaw/bundled-extensions; \ + if [[ -f '${profile_env}' ]]; then set -a; source '${profile_env}'; set +a; fi; \ + /home/openclaw/.local/bin/openclaw --profile '${profile}' models set '${model_ref}' >/dev/null; \ + /home/openclaw/.local/bin/openclaw --profile '${profile}' models status --plain" +done + +log "Credential sync completed for profiles: ${profiles_raw}" diff --git a/ops/install.sh b/ops/install.sh index ddbe23e..a07e1f9 100755 --- a/ops/install.sh +++ b/ops/install.sh @@ -15,6 +15,9 @@ need_cmd "${ansible_bin}" [[ -f "${inventory_file}" ]] || die "Inventory not found: ${inventory_file}" +log "Validating 
required secrets before install." +"${SCRIPT_DIR}/validate-secrets.sh" + extra_args=() if [[ -f "${vault_file}" ]]; then extra_args+=( -e "@${vault_file}" ) diff --git a/ops/oauth-login.sh b/ops/oauth-login.sh index 0193756..b504df9 100755 --- a/ops/oauth-login.sh +++ b/ops/oauth-login.sh @@ -5,15 +5,5 @@ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" # shellcheck source=ops/common.sh source "${SCRIPT_DIR}/common.sh" -provider="${OAUTH_PROVIDER:-openai-codex}" -profiles_raw="${PROFILES:-dev-main andrea}" - -log "Starting interactive OAuth login for provider=${provider} profiles=${profiles_raw}" - -for profile in ${profiles_raw}; do - log "OAuth login for profile=${profile}" - run_sudo -u openclaw -H bash -lc \ - "/home/openclaw/.local/bin/openclaw --profile '${profile}' models auth login --provider '${provider}'" -done - -log "OAuth login flow completed for all profiles." +log "make oauth-login now delegates to non-interactive credential sync (auth-sync)." +exec "${SCRIPT_DIR}/auth-sync.sh" diff --git a/ops/secrets-refactor.sh b/ops/secrets-refactor.sh new file mode 100755 index 0000000..201a52e --- /dev/null +++ b/ops/secrets-refactor.sh @@ -0,0 +1,78 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck source=ops/common.sh +source "${SCRIPT_DIR}/common.sh" + +need_cmd awk +need_cmd sed +need_cmd tr + +inventory_file="$(resolve_inventory)" +inventory_dir="$(cd "$(dirname "${inventory_file}")" && pwd)" +vault_file="${VAULT_FILE:-${inventory_dir}/group_vars/vault.yml}" +manual_file="${inventory_dir}/group_vars/vault.manual.refactor.yml" + +extract_env_value() { + local file="$1" + local key="$2" + if run_sudo test -f "${file}"; then + run_sudo awk -F= -v key="$key" '$1 == key {print $2; exit}' "${file}" | tr -d '\r' || true + fi +} + +current_or_default() { + local key="$1" + local fallback="$2" + if [[ -f "${vault_file}" ]]; then + local current + current="$(awk -F': *' -v key="$key" '$1 
== key {sub(/^["'"'"']/, "", $2); sub(/["'"'"']$/, "", $2); print $2; exit}' "${vault_file}")" + if [[ -n "${current}" ]]; then + printf '%s' "${current}" + return 0 + fi + fi + printf '%s' "${fallback}" +} + +dev_main_gateway_token="$(extract_env_value /etc/openclaw/secrets/dev-main.env OPENCLAW_GATEWAY_TOKEN)" +andrea_gateway_token="$(extract_env_value /etc/openclaw/secrets/andrea.env OPENCLAW_GATEWAY_TOKEN)" + +efra_cp_postgres="$(extract_env_value /home/efra/openclaw-control-plane/efra-core/.env POSTGRES_PASSWORD)" +efra_cp_nats="$(extract_env_value /home/efra/openclaw-control-plane/efra-core/.env NATS_PASSWORD)" +efra_tg_token="$(extract_env_value /home/efra/openclaw-control-plane/efra-core/.env TELEGRAM_BOT_TOKEN)" +efra_tg_chat="$(extract_env_value /home/efra/openclaw-control-plane/efra-core/.env TELEGRAM_DEFAULT_CHAT_ID)" + +andrea_cp_postgres="$(extract_env_value /home/efra/openclaw-control-plane/andrea/.env POSTGRES_PASSWORD)" +andrea_cp_nats="$(extract_env_value /home/efra/openclaw-control-plane/andrea/.env NATS_PASSWORD)" +andrea_tg_token="$(extract_env_value /home/efra/openclaw-control-plane/andrea/.env TELEGRAM_BOT_TOKEN)" +andrea_tg_chat="$(extract_env_value /home/efra/openclaw-control-plane/andrea/.env TELEGRAM_DEFAULT_CHAT_ID)" + +mkdir -p "${inventory_dir}/group_vars" +umask 077 +cat > "${manual_file}" </dev/null || true)" + if [[ -n "${task_json}" ]]; then + status="$(printf '%s' "${task_json}" | sed -n 's/.*"status":"\([^"]*\)".*/\1/p')" + if [[ -n "${status}" && "${status}" != "PENDING" && "${status}" != "QUEUED" && "${status}" != "RUNNING" ]]; then + break + fi + fi + sleep 1 + done - log "Queue flow OK (${profile_label}) taskId=${task_id}" + [[ -n "${status}" ]] || die "Task ${task_id} did not become visible in control API (${profile_label})." + if [[ "${status}" == "PENDING" || "${status}" == "QUEUED" || "${status}" == "RUNNING" ]]; then + die "Task ${task_id} did not reach terminal status in time (${profile_label}). 
Last status=${status}" + fi + + log "Queue flow OK (${profile_label}) taskId=${task_id} status=${status}" } log "Checking docker compose stack status." diff --git a/ops/validate-secrets.sh b/ops/validate-secrets.sh new file mode 100755 index 0000000..289c29d --- /dev/null +++ b/ops/validate-secrets.sh @@ -0,0 +1,51 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck source=ops/common.sh +source "${SCRIPT_DIR}/common.sh" + +inventory_file="$(resolve_inventory)" +inventory_dir="$(cd "$(dirname "${inventory_file}")" && pwd)" +vault_file="${VAULT_FILE:-${inventory_dir}/group_vars/vault.yml}" + +[[ -f "${vault_file}" ]] || die "Missing vault file: ${vault_file}" + +read_yaml_value() { + local key="$1" + awk -F': *' -v key="$key" '$1 == key {sub(/^["'"'"']/, "", $2); sub(/["'"'"']$/, "", $2); print $2; exit}' "${vault_file}" +} + +is_placeholder() { + local v="${1:-}" + [[ -z "${v}" ]] && return 0 + [[ "${v}" =~ replace-with|replace-me|changeme|example|dummy|temp-token|temp-key ]] +} + +required_keys=( + "vault_openclaw_gateway_token_dev_main" + "vault_openclaw_gateway_token_andrea" + "vault_openclaw_cp_postgres_password_efra_core" + "vault_openclaw_cp_nats_password_efra_core" + "vault_openclaw_cp_postgres_password_andrea" + "vault_openclaw_cp_nats_password_andrea" +) + +missing=() +for key in "${required_keys[@]}"; do + value="$(read_yaml_value "${key}")" + if is_placeholder "${value}"; then + missing+=("${key}") + fi +done + +if (( ${#missing[@]} > 0 )); then + printf '[ops] ERROR: vault secrets missing or placeholders detected:\n' >&2 + for key in "${missing[@]}"; do + printf ' - %s\n' "${key}" >&2 + done + printf '[ops] Run: make secrets-refactor\n' >&2 + exit 1 +fi + +log "Vault secret validation passed: ${vault_file}" diff --git a/roles/openclaw_control_plane/tasks/profile.yml b/roles/openclaw_control_plane/tasks/profile.yml index 8673fa4..efef6fd 100644 --- 
a/roles/openclaw_control_plane/tasks/profile.yml +++ b/roles/openclaw_control_plane/tasks/profile.yml @@ -79,6 +79,41 @@ remove_orphans: true when: openclaw_control_plane_manage_stack | bool +- name: Wait for profile postgres container to accept local connections + ansible.builtin.shell: | + docker exec {{ openclaw_control_plane_project_prefix }}-{{ profile.name }}-postgres-1 \ + sh -lc "export PGPASSWORD='{{ profile.postgres_admin_password | default(profile.postgres_password) }}'; \ + psql -h 127.0.0.1 -U {{ profile.postgres_admin_user | default(profile.postgres_user | default('openclaw')) }} \ + -d {{ profile.postgres_db | default('openclaw_control') }} -tAc 'select 1'" + register: profile_postgres_ready + retries: 30 + delay: 2 + until: + - profile_postgres_ready.rc == 0 + - "'1' in profile_postgres_ready.stdout" + changed_when: false + when: openclaw_control_plane_manage_stack | bool + +- name: Reconcile postgres role password for control-plane profile + ansible.builtin.shell: | + docker exec {{ openclaw_control_plane_project_prefix }}-{{ profile.name }}-postgres-1 \ + sh -lc "export PGPASSWORD='{{ profile.postgres_admin_password | default(profile.postgres_password) }}'; \ + psql -h 127.0.0.1 -U {{ profile.postgres_admin_user | default(profile.postgres_user | default('openclaw')) }} \ + -d {{ profile.postgres_db | default('openclaw_control') }} -v ON_ERROR_STOP=1 <<'SQL' + DO \$\$ + BEGIN + IF NOT EXISTS (SELECT 1 FROM pg_roles WHERE rolname = '{{ profile.postgres_user | default('openclaw') }}') THEN + CREATE ROLE {{ profile.postgres_user | default('openclaw') }} LOGIN PASSWORD '{{ profile.postgres_password }}'; + ELSE + ALTER ROLE {{ profile.postgres_user | default('openclaw') }} WITH LOGIN PASSWORD '{{ profile.postgres_password }}'; + END IF; + END + \$\$; + SQL" + register: profile_postgres_reconcile + changed_when: false + when: openclaw_control_plane_manage_stack | bool + - name: Probe control API health ansible.builtin.uri: url: "http://127.0.0.1:{{ 
profile.control_api_port | default(39090) }}/health" From 7652454c4d399b8bd6206174d1ed790427449556 Mon Sep 17 00:00:00 2001 From: Efrain Garay Date: Sun, 1 Mar 2026 20:21:22 -0300 Subject: [PATCH 12/16] fix: address review findings in control-plane and gateway role --- .../src/control-api/control.service.ts | 21 ++++++++++++- .../openclaw_control_plane/defaults/main.yml | 2 ++ .../openclaw_control_plane/tasks/profile.yml | 30 +++++++++++-------- .../templates/control-plane.env.j2 | 2 ++ .../templates/docker-compose.full.yml.j2 | 8 ++--- .../templates/docker-compose.lite.yml.j2 | 2 +- .../openclaw-gateway-profile.service.j2 | 2 +- 7 files changed, 48 insertions(+), 19 deletions(-) diff --git a/control-plane/src/control-api/control.service.ts b/control-plane/src/control-api/control.service.ts index b9a81b9..fa3fc67 100644 --- a/control-plane/src/control-api/control.service.ts +++ b/control-plane/src/control-api/control.service.ts @@ -61,6 +61,25 @@ export class ControlService implements OnModuleInit, OnModuleDestroy { createdAt: new Date().toISOString() }; + const nextStatus = decision === 'confirm' ? 'DONE' : 'FAILED'; + + const updated = await this.pg?.query( + ` + UPDATE tasks + SET status = $1, + needs_confirmation = FALSE, + updated_at = NOW() + WHERE task_id = $2 + AND profile = $3 + AND needs_confirmation = TRUE + `, + [nextStatus, taskId, this.cfg.profile] + ); + + if ((updated?.rowCount ?? 
0) === 0) { + this.logger.warn(`Decision ${decision} for task ${taskId} did not match a pending confirmation row`); + } + this.nc?.publish(`control.${decision}.${this.cfg.profile}`, encodeJson(command)); await this.pg?.query( @@ -68,7 +87,7 @@ export class ControlService implements OnModuleInit, OnModuleDestroy { INSERT INTO task_events (task_id, profile, event_type, from_agent, payload) VALUES ($1, $2, $3, $4, $5::jsonb) `, - [taskId, this.cfg.profile, `decision_${decision}`, actor, JSON.stringify(command)] + [taskId, this.cfg.profile, `decision_${decision}`, actor, JSON.stringify({ ...command, status: nextStatus })] ); } diff --git a/roles/openclaw_control_plane/defaults/main.yml b/roles/openclaw_control_plane/defaults/main.yml index bee3fb7..f8ab83b 100644 --- a/roles/openclaw_control_plane/defaults/main.yml +++ b/roles/openclaw_control_plane/defaults/main.yml @@ -10,6 +10,8 @@ openclaw_control_plane_owner: "{{ ansible_user | default('openclaw') }}" openclaw_control_plane_group: "{{ ansible_user | default('openclaw') }}" openclaw_control_plane_env_owner: "{{ openclaw_control_plane_owner }}" openclaw_control_plane_env_group: "{{ openclaw_control_plane_group }}" +openclaw_control_plane_worker_uid: "{{ openclaw_uid_value | default('1000') }}" +openclaw_control_plane_worker_gid: "{{ openclaw_gid_value | default(openclaw_control_plane_worker_uid) }}" openclaw_control_plane_health_retries: 20 openclaw_control_plane_health_delay: 3 diff --git a/roles/openclaw_control_plane/tasks/profile.yml b/roles/openclaw_control_plane/tasks/profile.yml index efef6fd..8073358 100644 --- a/roles/openclaw_control_plane/tasks/profile.yml +++ b/roles/openclaw_control_plane/tasks/profile.yml @@ -81,10 +81,13 @@ - name: Wait for profile postgres container to accept local connections ansible.builtin.shell: | - docker exec {{ openclaw_control_plane_project_prefix }}-{{ profile.name }}-postgres-1 \ - sh -lc "export PGPASSWORD='{{ profile.postgres_admin_password | 
default(profile.postgres_password) }}'; \ - psql -h 127.0.0.1 -U {{ profile.postgres_admin_user | default(profile.postgres_user | default('openclaw')) }} \ - -d {{ profile.postgres_db | default('openclaw_control') }} -tAc 'select 1'" + docker exec \ + -e PGPASSWORD='{{ profile.postgres_admin_password | default(profile.postgres_password) | replace("'", "'\"'\"'") }}' \ + {{ openclaw_control_plane_project_prefix }}-{{ profile.name }}-postgres-1 \ + psql -h 127.0.0.1 \ + -U {{ profile.postgres_admin_user | default(profile.postgres_user | default('openclaw')) }} \ + -d {{ profile.postgres_db | default('openclaw_control') }} \ + -tAc 'select 1' register: profile_postgres_ready retries: 30 delay: 2 @@ -96,27 +99,30 @@ - name: Reconcile postgres role password for control-plane profile ansible.builtin.shell: | - docker exec {{ openclaw_control_plane_project_prefix }}-{{ profile.name }}-postgres-1 \ - sh -lc "export PGPASSWORD='{{ profile.postgres_admin_password | default(profile.postgres_password) }}'; \ - psql -h 127.0.0.1 -U {{ profile.postgres_admin_user | default(profile.postgres_user | default('openclaw')) }} \ - -d {{ profile.postgres_db | default('openclaw_control') }} -v ON_ERROR_STOP=1 <<'SQL' + docker exec -i \ + -e PGPASSWORD='{{ profile.postgres_admin_password | default(profile.postgres_password) | replace("'", "'\"'\"'") }}' \ + {{ openclaw_control_plane_project_prefix }}-{{ profile.name }}-postgres-1 \ + psql -h 127.0.0.1 \ + -U {{ profile.postgres_admin_user | default(profile.postgres_user | default('openclaw')) }} \ + -d {{ profile.postgres_db | default('openclaw_control') }} \ + -v ON_ERROR_STOP=1 <<'SQL' DO \$\$ BEGIN IF NOT EXISTS (SELECT 1 FROM pg_roles WHERE rolname = '{{ profile.postgres_user | default('openclaw') }}') THEN - CREATE ROLE {{ profile.postgres_user | default('openclaw') }} LOGIN PASSWORD '{{ profile.postgres_password }}'; + CREATE ROLE {{ profile.postgres_user | default('openclaw') }} LOGIN PASSWORD '{{ profile.postgres_password | 
replace("'", "''") }}'; ELSE - ALTER ROLE {{ profile.postgres_user | default('openclaw') }} WITH LOGIN PASSWORD '{{ profile.postgres_password }}'; + ALTER ROLE {{ profile.postgres_user | default('openclaw') }} WITH LOGIN PASSWORD '{{ profile.postgres_password | replace("'", "''") }}'; END IF; END \$\$; - SQL" + SQL register: profile_postgres_reconcile changed_when: false when: openclaw_control_plane_manage_stack | bool - name: Probe control API health ansible.builtin.uri: - url: "http://127.0.0.1:{{ profile.control_api_port | default(39090) }}/health" + url: "http://127.0.0.1:{{ profile.control_api_port | default((profile.mode == 'lite') | ternary(39111, 39101)) }}/health" method: GET status_code: 200 register: profile_health diff --git a/roles/openclaw_control_plane/templates/control-plane.env.j2 b/roles/openclaw_control_plane/templates/control-plane.env.j2 index abe55ea..1994589 100644 --- a/roles/openclaw_control_plane/templates/control-plane.env.j2 +++ b/roles/openclaw_control_plane/templates/control-plane.env.j2 @@ -13,6 +13,8 @@ ROUTER_FORCED_AGENT={{ profile.router_forced_agent | default('') }} WORKER_EXEC_MODE={{ profile.worker_exec_mode | default('stub') }} OPENCLAW_BIN={{ profile.openclaw_bin | default('/home/openclaw/.local/bin/openclaw') }} OPENCLAW_HOME={{ profile.openclaw_home | default('/home/openclaw') }} +OPENCLAW_UID={{ profile.openclaw_worker_uid | default(openclaw_control_plane_worker_uid) }} +OPENCLAW_GID={{ profile.openclaw_worker_gid | default(openclaw_control_plane_worker_gid) }} OPENCLAW_ENV_FILE={{ profile.openclaw_env_file | default('/etc/openclaw/secrets/' ~ (profile.gateway_profile | default(profile.name)) ~ '.env') }} OPENCLAW_GATEWAY_TOKEN={{ profile.openclaw_gateway_token | default('') }} OPENCLAW_TIMEOUT_MS={{ profile.openclaw_timeout_ms | default(120000) }} diff --git a/roles/openclaw_control_plane/templates/docker-compose.full.yml.j2 b/roles/openclaw_control_plane/templates/docker-compose.full.yml.j2 index f6cb4e4..23d42e9 100644 
--- a/roles/openclaw_control_plane/templates/docker-compose.full.yml.j2 +++ b/roles/openclaw_control_plane/templates/docker-compose.full.yml.j2 @@ -81,7 +81,7 @@ services: OPENCLAW_TIMEOUT_MS: ${OPENCLAW_TIMEOUT_MS} OPENCLAW_BUNDLED_PLUGINS_DIR: ${OPENCLAW_BUNDLED_PLUGINS_DIR} {% if profile.worker_exec_mode | default('stub') == 'openclaw' %} - user: "994:994" + user: "${OPENCLAW_UID}:${OPENCLAW_GID}" volumes: - /home/openclaw:/home/openclaw {% endif %} @@ -107,7 +107,7 @@ services: OPENCLAW_TIMEOUT_MS: ${OPENCLAW_TIMEOUT_MS} OPENCLAW_BUNDLED_PLUGINS_DIR: ${OPENCLAW_BUNDLED_PLUGINS_DIR} {% if profile.worker_exec_mode | default('stub') == 'openclaw' %} - user: "994:994" + user: "${OPENCLAW_UID}:${OPENCLAW_GID}" volumes: - /home/openclaw:/home/openclaw {% endif %} @@ -136,7 +136,7 @@ services: {% if profile.worker_exec_mode | default('stub') == 'openclaw' %} network_mode: host shm_size: "1gb" - user: "994:994" + user: "${OPENCLAW_UID}:${OPENCLAW_GID}" volumes: - /home/openclaw:/home/openclaw {% endif %} @@ -162,7 +162,7 @@ services: OPENCLAW_TIMEOUT_MS: ${OPENCLAW_TIMEOUT_MS} OPENCLAW_BUNDLED_PLUGINS_DIR: ${OPENCLAW_BUNDLED_PLUGINS_DIR} {% if profile.worker_exec_mode | default('stub') == 'openclaw' %} - user: "994:994" + user: "${OPENCLAW_UID}:${OPENCLAW_GID}" volumes: - /home/openclaw:/home/openclaw {% endif %} diff --git a/roles/openclaw_control_plane/templates/docker-compose.lite.yml.j2 b/roles/openclaw_control_plane/templates/docker-compose.lite.yml.j2 index 8c6783b..08a0c0b 100644 --- a/roles/openclaw_control_plane/templates/docker-compose.lite.yml.j2 +++ b/roles/openclaw_control_plane/templates/docker-compose.lite.yml.j2 @@ -65,7 +65,7 @@ services: OPENCLAW_TIMEOUT_MS: ${OPENCLAW_TIMEOUT_MS} OPENCLAW_BUNDLED_PLUGINS_DIR: ${OPENCLAW_BUNDLED_PLUGINS_DIR} {% if profile.worker_exec_mode | default('stub') == 'openclaw' %} - user: "994:994" + user: "${OPENCLAW_UID}:${OPENCLAW_GID}" volumes: - /home/openclaw:/home/openclaw {% endif %} diff --git 
a/roles/openclaw_enterprise/templates/openclaw-gateway-profile.service.j2 b/roles/openclaw_enterprise/templates/openclaw-gateway-profile.service.j2 index 281493f..5054271 100644 --- a/roles/openclaw_enterprise/templates/openclaw-gateway-profile.service.j2 +++ b/roles/openclaw_enterprise/templates/openclaw-gateway-profile.service.j2 @@ -16,7 +16,7 @@ Environment="PATH={{ openclaw_home }}/.local/bin:{{ openclaw_home }}/.local/shar Environment="HOME={{ openclaw_home }}" Environment="XDG_RUNTIME_DIR=/run/user/{{ openclaw_uid_value | default('1000') }}" -ExecStart=/bin/sh {{ openclaw_enterprise_openclaw_bin }} --profile {{ profile.name }} gateway --bind {{ profile.gateway_bind | default('loopback') }} --port {{ profile.gateway_port }} +ExecStart={{ openclaw_enterprise_openclaw_bin }} --profile {{ profile.name }} gateway --bind {{ profile.gateway_bind | default('loopback') }} --port {{ profile.gateway_port }} Restart=always RestartSec=5 NoNewPrivileges=true From f680eb649b23e9804c4dbfa7ff1aaeba350b131c Mon Sep 17 00:00:00 2001 From: Efrain Garay Date: Sun, 1 Mar 2026 20:47:21 -0300 Subject: [PATCH 13/16] docs: rewrite README as ansible base protocol guide --- README.md | 480 +++++++++++++++++++++++------------------------------- 1 file changed, 202 insertions(+), 278 deletions(-) diff --git a/README.md b/README.md index 2584b43..a24aa09 100644 --- a/README.md +++ b/README.md @@ -1,347 +1,271 @@ -# OpenClaw Ansible Installer +# OpenClaw Ansible Base Protocol [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) [![Lint](https://github.com/openclaw/openclaw-ansible/actions/workflows/lint.yml/badge.svg)](https://github.com/openclaw/openclaw-ansible/actions/workflows/lint.yml) [![Ansible](https://img.shields.io/badge/Ansible-2.14+-blue.svg)](https://www.ansible.com/) [![Multi-OS](https://img.shields.io/badge/OS-Debian%20%7C%20Ubuntu%20%7C%20Fedora-orange.svg)](https://www.debian.org/) -Automated, hardened installation of 
[OpenClaw](https://github.com/openclaw/openclaw) with Docker and Tailscale VPN support for Debian/Ubuntu Linux. +Base operativa en Ansible para desplegar y operar OpenClaw en modo enterprise, con perfiles múltiples, control-plane Stage 2, sincronización no interactiva de credenciales Codex y flujo de operaciones reproducible. -## ⚠️ macOS Support: Deprecated & Disabled +> Este repositorio **no es el producto OpenClaw final**. +> Es la **capa base de infraestructura/protocolo de despliegue** para instalar, reconciliar, purgar, validar y operar entornos OpenClaw de forma consistente. -**Effective 2026-02-06, support for bare-metal macOS installations has been removed from this playbook.** +## Descripción Corta Sugerida del Repositorio -### Why? -The underlying project currently requires system-level permissions and configurations that introduce significant security risks when executed on a primary host OS. To protect user data and system integrity, we have disabled bare-metal execution. +Si quieres actualizar la descripción en GitHub, puedes usar esta frase: -### What does this mean? -* The playbook will now explicitly fail if run on a `Darwin` (macOS) system. -* We strongly discourage manual workarounds to bypass this check. -* **Future Support:** We are evaluating a virtualization-first strategy (using Vagrant or Docker) to provide a sandboxed environment for this project in the future. 
+`Base Ansible para OpenClaw Enterprise + Stage 2 Control Plane (NATS/NestJS), con auth-sync Codex, smoke tests y operación day-2 reproducible.` -## Features +## Qué Es y Qué No Es -- 🔒 **Firewall-first**: UFW firewall + Docker isolation -- 🛡️ **Fail2ban**: SSH brute-force protection out of the box -- 🔄 **Auto-updates**: Automatic security patches via unattended-upgrades -- 🔐 **Tailscale VPN**: Secure remote access without exposing services -- 🐳 **Docker**: Docker CE with security hardening -- 🚀 **One-command install**: Complete setup in minutes -- 🔧 **Auto-configuration**: DBus, systemd, environment setup -- 📦 **pnpm installation**: Uses `pnpm install -g openclaw@latest` +### Sí es -## Quick Start +- Un blueprint de infraestructura para OpenClaw. +- Un conjunto de roles Ansible reutilizables (`openclaw`, `openclaw_enterprise`, `openclaw_control_plane`, `openclaw_cloudflare_tunnel`). +- Un workflow operativo con `Makefile` + `ops/*.sh` para day-0/day-1/day-2. +- Un paquete Stage 2 full/lite para colas, enrutamiento, workers y observabilidad. -### Release Mode (Recommended) +### No es -Install the latest stable version from npm: +- Una app monolítica única de negocio. +- Un reemplazo del repositorio principal de OpenClaw. +- Un instalador "one-click" sin decisiones operativas: aquí se orquesta infraestructura real con perfiles, secretos y reglas de operación. -```bash -curl -fsSL https://raw.githubusercontent.com/openclaw/openclaw-ansible/main/install.sh | bash -``` +## Novedades Relevantes de Esta Base -### Development Mode +- Operación estandarizada con `make backup/purge/install/auth-sync/smoke/reinstall`. +- `auth-sync` no interactivo para Codex usando credenciales de `/home/efra/.codex`. +- Stage 2 control-plane en dos modos: + - `full` (ejemplo: `efra-core`) + - `lite` (ejemplo: `andrea`) +- Exposición opcional por Cloudflare Tunnel de endpoints locales. +- Documentación de layout instalado con permisos detallados y diagramas Mermaid. 
+- Endurecimientos recientes en despliegue y control-plane: + - `ExecStart` directo en systemd (sin wrapper shell innecesario). + - Healthcheck de control-api alineado con puertos host por modo (`full`/`lite`). + - UID/GID de workers parametrizado (sin hardcode `994:994`). + - `confirm/reject` ahora actualiza estado en DB (`needs_confirmation=false`). + - Escape seguro de contraseña en reconciliación SQL de Postgres. -Install from source for development or testing: +## Arquitectura (Vista Rápida) -```bash -# Clone the installer -git clone https://github.com/openclaw/openclaw-ansible.git -cd openclaw-ansible - -# Install in development mode -ansible-playbook playbook.yml --ask-become-pass -e openclaw_install_mode=development -``` +```mermaid +flowchart LR + A[Ansible / Makefile / ops scripts] --> B[openclaw role] + A --> C[openclaw_enterprise role] + A --> D[openclaw_control_plane role] + A --> E[openclaw_cloudflare_tunnel role] -## What Gets Installed + C --> F[Gateway profile dev-main] + C --> G[Gateway profile andrea] -- Tailscale (mesh VPN) -- UFW firewall (SSH + Tailscale ports only) -- Docker CE + Compose V2 (for sandboxes) -- Node.js 22.x + pnpm -- OpenClaw on host (not containerized) -- Systemd service (auto-start) - -## Post-Install - -After installation completes, switch to the openclaw user: - -```bash -sudo su - openclaw -``` + D --> H[efra-core full] + D --> I[andrea lite] -Then run the quick-start onboarding wizard: + H --> H1[NATS + Postgres] + H --> H2[Ingress + Router + Broker] + H --> H3[Workers main/research/browser-login/coolify-ops] + H --> H4[Control API + Prometheus + Grafana + Uptime Kuma] -```bash -openclaw onboard --install-daemon + I --> I1[NATS + Postgres] + I --> I2[Ingress + Router forced main + Worker main + Broker + Control API] ``` -This will: -- Guide you through the setup wizard -- Configure your messaging provider (WhatsApp/Telegram/Signal) -- Install and start the daemon service - -### Alternative Manual Setup - -```bash -# 
Configure manually -openclaw configure - -# Login to provider -openclaw providers login - -# Test gateway -openclaw gateway - -# Install as daemon -openclaw daemon install -openclaw daemon start - -# Check status -openclaw status -openclaw logs +## Flujo de Mensaje (Telegram/API -> Agente -> Resultado) + +```mermaid +sequenceDiagram + autonumber + participant ING as ingress + participant NATS as NATS JetStream + participant RT as router + participant WK as worker-<agent> + participant BR as broker + participant PG as postgres + participant API as control-api + + ING->>NATS: publish tasks.ingress + RT->>NATS: consume tasks.ingress + RT->>NATS: publish tasks.agent.<agent> + WK->>NATS: consume tasks.agent.<agent> + WK->>NATS: publish results.agent.<agent> + BR->>NATS: consume results.agent.* + BR->>PG: upsert tasks + insert task_events + API->>PG: GET /tasks, POST /tasks/:id/confirm|reject ``` -## Installation Modes +## Perfiles de Referencia (Inventario `dev`) -### Release Mode (Default) -- Installs via `pnpm install -g openclaw@latest` -- Gets latest stable version from npm registry -- Automatic updates via `pnpm install -g openclaw@latest` -- **Recommended for production** +Configurados actualmente en `inventories/dev/group_vars/all.yml`: -### Development Mode -- Clones from `https://github.com/openclaw/openclaw.git` -- Builds from source with `pnpm build` -- Symlinks binary to `~/.local/bin/openclaw` -- Adds helpful aliases: - - `openclaw-rebuild` - Rebuild after code changes - - `openclaw-dev` - Navigate to repo directory - - `openclaw-pull` - Pull, install deps, and rebuild -- **Recommended for development and testing** +- Gateway Enterprise: + - `dev-main` en `127.0.0.1:19011` con agentes `main/research/browser-login/coolify-ops`. + - `andrea` en `127.0.0.1:19031` con agente `main`. +- Control-plane Stage 2: + - `efra-core` modo `full` (`ingress=30101`, `control-api=39101`, `grafana=31001`, `prometheus=39091`). + - `andrea` modo `lite` (`ingress=30111`, `control-api=39111`). 
-Enable with: `-e openclaw_install_mode=development` +## Operación Recomendada (Day-2) -## Security - -- **Public ports**: SSH (22), Tailscale (41641/udp) only -- **Fail2ban**: SSH brute-force protection (5 attempts → 1 hour ban) -- **Automatic updates**: Security patches via unattended-upgrades -- **Docker isolation**: Containers can't expose ports externally (DOCKER-USER chain) -- **Non-root**: OpenClaw runs as unprivileged user -- **Scoped sudo**: Limited to service management (not full root) -- **Systemd hardening**: NoNewPrivileges, PrivateTmp, ProtectSystem - -Verify: `nmap -p- YOUR_SERVER_IP` should show only port 22 open. - -### Security Note - -For high-security environments, audit before running: +Desde la raíz del repo: ```bash -git clone https://github.com/openclaw/openclaw-ansible.git -cd openclaw-ansible -# Review playbook.yml and roles/ -ansible-playbook playbook.yml --check --diff # Dry run -ansible-playbook playbook.yml --ask-become-pass -``` - -## Documentation - -- [Operator Runbook](docs/operator-runbook.md) - End-to-end profile/agent/auth-sync/queue operations guide -- [Configuration Guide](docs/configuration.md) - All configuration options -- [Development Mode](docs/development-mode.md) - Build from source -- [Security Architecture](docs/security.md) - Security details -- [Technical Details](docs/architecture.md) - Architecture overview -- [Enterprise Deployment](docs/enterprise-deployment.md) - Multi-profile deployment -- [Stage 2 Control Plane](docs/control-plane-stage2.md) - NATS + NestJS full/lite package -- [Cloudflare Tunnel Exposure](docs/cloudflare-tunnel.md) - Subdomain publishing for local services -- [Operations Workflow](docs/operations-workflow.md) - Backup/purge/install with Makefile -- [Troubleshooting](docs/troubleshooting.md) - Common issues -- [Agent Guidelines](AGENTS.md) - AI agent instructions - -## Operations Workflow (Makefile) - -For repeatable day-2 operations (backup, clean reinstall, smoke checks), use: - -```bash 
-cd openclaw-ansible - -# Backup current state make backup - -# Purge runtime state (requires explicit confirmation) make purge CONFIRM=1 - -# Reinstall enterprise + stage2 control-plane make install - -# Reconcile only Cloudflare tunnel/service (if enabled in inventory) -make cloudflare - -# Non-interactive Codex credential sync make auth-sync PROFILES="dev-main andrea" OAUTH_PROVIDER=openai-codex -# legacy alias (same behavior) -make oauth-login PROFILES="dev-main andrea" OAUTH_PROVIDER=openai-codex - -# Validate full flow make smoke - -# One-shot full cycle -make reinstall CONFIRM=1 ``` -## Requirements - -- Debian 11+ or Ubuntu 20.04+ or Fedora 40+ -- Root/sudo access -- Internet connection - -## What Gets Installed - -- Tailscale (mesh VPN) -- UFW firewall (SSH + Tailscale ports only) -- Docker CE + Compose V2 (for sandboxes) -- Node.js 22.x + pnpm -- OpenClaw on host (not containerized) -- Systemd service (auto-start) - -## Manual Installation - -### Release Mode (Default) +Ciclo completo en una sola orden: ```bash -# Install dependencies -sudo apt update && sudo apt install -y ansible git - -# Clone repository -git clone https://github.com/openclaw/openclaw-ansible.git -cd openclaw-ansible - -# Install Ansible collections -ansible-galaxy collection install -r requirements.yml - -# Run installation -./run-playbook.sh +make reinstall CONFIRM=1 ``` -### Development Mode - -Build from source for development: - -```bash -# Same as above, but with development mode flag -./run-playbook.sh -e openclaw_install_mode=development - -# Or directly: -ansible-playbook playbook.yml --ask-become-pass -e openclaw_install_mode=development +## Targets del Makefile + +| Target | Propósito | +|---|---| +| `make backup` | Respalda estado conocido de OpenClaw + control-plane | +| `make purge CONFIRM=1` | Purga estado runtime (destructivo) | +| `make install` | Reconciliación enterprise + control-plane | +| `make secrets-refactor` | Genera archivo manual para migrar/normalizar 
secretos | +| `make cloudflare` | Reconciliación exclusiva de tunnel/cloudflared | +| `make auth-sync` | Sincroniza credenciales Codex a perfiles/agentes | +| `make oauth-login` | Alias legado de `make auth-sync` | +| `make smoke` | Pruebas de salud y flujo de cola | +| `make reinstall CONFIRM=1` | `backup + purge + install + smoke` | + +Variables principales: + +- `ENV` (default `dev`) +- `INVENTORY` (default `inventories/<ENV>/hosts.yml`) +- `LIMIT` (default `zennook`) +- `PROFILES` (default `dev-main andrea`) +- `OAUTH_PROVIDER` (default `openai-codex`) +- `MODEL_REF` (default `openai-codex/gpt-5.3-codex`) + +## Auth Sync Codex (No Interactivo) + +`ops/auth-sync.sh` realiza este pipeline: + +1. Lee credenciales fuente (por defecto): + - `/home/efra/.codex/auth.json` + - `/home/efra/.codex/auth-andrea.json` +2. Copia credenciales a: + - `/home/openclaw/.codex/auth.json` + - `/home/openclaw/.codex/auth-andrea.json` +3. Escribe `auth-profiles.json` por agente en cada perfil destino. +4. Ajusta modelo por perfil con: + - `openclaw --profile <profile> models set <model_ref>` + +Sobrescrituras opcionales (cargadas desde `/home/efra/.env` si existe): + +- `EFRA_CODEX_HOME` +- `EFRA_CODEX_AUTH_DEFAULT` +- `EFRA_CODEX_AUTH_ANDREA` + +## Smoke, Regresión e Idempotencia + +### Smoke operativo + +`make smoke` valida, entre otros: + +- Estado de stacks Docker Compose esperados. +- Endpoints de salud (`/health`) en ingress y control-api. +- Flujo de cola con `/ingress/simulate` hasta estado terminal en control-api. + +### Harness de regresión + +Existe harness Docker CI en `tests/run-tests.sh` con 3 fases: + +1. Convergencia. +2. Verificación. +3. Idempotencia. + +Estado observado en ejecución del 2026-03-01: + +- Convergencia: `PASS` +- Verificación: `PASS` +- Idempotencia: `FAIL` por 1 cambio en tarea no relacionada a control-plane (`Ensure pnpm directories have correct ownership`). + +## Estructura del Repositorio + +```text +. 
+├── playbook.yml # instalación base local (role openclaw) +├── playbooks/ +│ ├── enterprise.yml # despliegue enterprise multi-perfil +│ └── control-plane-only.yml # reconciliación dedicada de control-plane +├── roles/ +│ ├── openclaw +│ ├── openclaw_enterprise +│ ├── openclaw_control_plane +│ └── openclaw_cloudflare_tunnel +├── control-plane/ # servicios NestJS Stage 2 +├── inventories/ # dev/staging/prod/research +├── ops/ # scripts operativos usados por Makefile +├── docs/ # runbooks, arquitectura, troubleshooting +└── tests/ # harness Docker de convergencia/verificación/idempotencia ``` -This will: -- Clone openclaw repo to `~/code/openclaw` -- Run `pnpm install` and `pnpm build` -- Symlink binary to `~/.local/bin/openclaw` -- Add development aliases to `.bashrc` - -## Configuration Options +## Seguridad y Permisos -All configuration variables can be found in [`roles/openclaw/defaults/main.yml`](roles/openclaw/defaults/main.yml). +Controles principales que deja esta base: -You can override them in three ways: +- Usuario no root para OpenClaw (`openclaw`). +- Secretos por perfil bajo `/etc/openclaw/secrets/*.env`. +- Servicios systemd por perfil de gateway. +- Aislamiento de runtime con Docker para control-plane. +- Endpoints en loopback y exposición opcional por tunnel. -### 1. Via Command Line +Para layout completo con rutas y permisos (`owner:group` + `mode`), revisa: -```bash -ansible-playbook playbook.yml --ask-become-pass \ - -e openclaw_install_mode=development \ - -e "openclaw_ssh_keys=['ssh-ed25519 AAAAC3... user@host']" -``` - -### 2. Via Variables File +- [Installed Runtime Layout](docs/architecture-installed-layout.md) -```bash -# Create vars.yml -cat > vars.yml << EOF -openclaw_install_mode: development -openclaw_ssh_keys: - - "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIGxxxxxxxx user@host" - - "ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAAB... 
user@host" -openclaw_repo_url: "https://github.com/YOUR_USERNAME/openclaw.git" -openclaw_repo_branch: "feature-branch" -tailscale_authkey: "tskey-auth-xxxxxxxxxxxxx" -EOF - -# Use it -ansible-playbook playbook.yml --ask-become-pass -e @vars.yml -``` +## Sistemas Operativos Soportados -### 3. Edit Defaults Directly +- Debian +- Ubuntu +- Fedora -Edit `roles/openclaw/defaults/main.yml` before running the playbook. +### Estado de macOS -### Available Variables +La ejecución bare-metal en macOS está deshabilitada en este repo. +El playbook falla explícitamente en `Darwin` para evitar instalación insegura fuera del modelo soportado. -| Variable | Default | Description | -|----------|---------|-------------| -| `openclaw_user` | `openclaw` | System user name | -| `openclaw_home` | `/home/openclaw` | User home directory | -| `openclaw_install_mode` | `release` | `release` or `development` | -| `openclaw_ssh_keys` | `[]` | List of SSH public keys | -| `openclaw_repo_url` | `https://github.com/openclaw/openclaw.git` | Git repository (dev mode) | -| `openclaw_repo_branch` | `main` | Git branch (dev mode) | -| `tailscale_authkey` | `""` | Tailscale auth key for auto-connect | -| `nodejs_version` | `22.x` | Node.js version to install | +## Documentación Clave -See [`roles/openclaw/defaults/main.yml`](roles/openclaw/defaults/main.yml) for the complete list. 
+- [Operator Runbook](docs/operator-runbook.md) +- [Operations Workflow](docs/operations-workflow.md) +- [Stage 2 Control Plane](docs/control-plane-stage2.md) +- [Enterprise Deployment](docs/enterprise-deployment.md) +- [Installed Runtime Layout](docs/architecture-installed-layout.md) +- [Cloudflare Tunnel](docs/cloudflare-tunnel.md) +- [Troubleshooting](docs/troubleshooting.md) +- [Configuration Guide](docs/configuration.md) +- [Security Architecture](docs/security.md) +- [Agent Guidelines](AGENTS.md) -### Common Configuration Examples - -#### SSH Keys for Remote Access +## Instalación Manual (Si No Usas Make) ```bash -ansible-playbook playbook.yml --ask-become-pass \ - -e "openclaw_ssh_keys=['ssh-ed25519 AAAAC3... user@host']" -``` - -#### Development Mode with Custom Repository - -```bash -ansible-playbook playbook.yml --ask-become-pass \ - -e openclaw_install_mode=development \ - -e openclaw_repo_url=https://github.com/YOUR_USERNAME/openclaw.git \ - -e openclaw_repo_branch=feature-branch +ansible-galaxy collection install -r requirements.yml +ansible-playbook -i inventories/dev/hosts.yml playbooks/enterprise.yml -l zennook --become ``` -#### Tailscale Auto-Connect +Para instalación base local mínima: ```bash -ansible-playbook playbook.yml --ask-become-pass \ - -e tailscale_authkey=tskey-auth-xxxxxxxxxxxxx +ansible-playbook playbook.yml --become ``` -## License +## Licencia -MIT - see [LICENSE](LICENSE) +MIT. Ver [LICENSE](LICENSE). 
-## Support +## Referencias - OpenClaw: https://github.com/openclaw/openclaw -- This installer: https://github.com/openclaw/openclaw-ansible/issues - -## Enterprise Multi-Environment Deployment - -For multi-node environments with profile isolation and model/provider routing, use: - -- `playbooks/enterprise.yml` -- `inventories/dev|staging|prod|research` -- `roles/openclaw_enterprise` -- `run-enterprise-playbook.sh` - -Guide: `docs/enterprise-deployment.md` - -Android companion nodes are supported in this topology as gateway-paired WS nodes -(inventory metadata group: `openclaw_mobile_nodes`; operational flow via -`openclaw nodes pending|approve|status` on a gateway host). - -A dedicated `browser-login` agent is also included in enterprise profile examples: -browser-only tool policy and `openclaw` managed browser profile for manual login flows. +- Issues de esta base: https://github.com/openclaw/openclaw-ansible/issues From 7ccca6b828203c31200f4338991e5fee975975f5 Mon Sep 17 00:00:00 2001 From: Efrain Garay Date: Sun, 1 Mar 2026 21:00:32 -0300 Subject: [PATCH 14/16] docs: rebrand suite narrative and rewrite core operational docs --- README.md | 281 +++++++++++++--------------------- control-plane/README.md | 78 +++++++--- docs/architecture.md | 180 +++++++++------------- docs/control-plane-stage2.md | 108 +++++-------- docs/enterprise-deployment.md | 163 +++++++------------- docs/operations-workflow.md | 97 +++++------- 6 files changed, 374 insertions(+), 533 deletions(-) diff --git a/README.md b/README.md index a24aa09..90995eb 100644 --- a/README.md +++ b/README.md @@ -1,113 +1,96 @@ -# OpenClaw Ansible Base Protocol +# ClawOps Protocol Suite (Ansible Base) [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) [![Lint](https://github.com/openclaw/openclaw-ansible/actions/workflows/lint.yml/badge.svg)](https://github.com/openclaw/openclaw-ansible/actions/workflows/lint.yml) 
[![Ansible](https://img.shields.io/badge/Ansible-2.14+-blue.svg)](https://www.ansible.com/) [![Multi-OS](https://img.shields.io/badge/OS-Debian%20%7C%20Ubuntu%20%7C%20Fedora-orange.svg)](https://www.debian.org/) -Base operativa en Ansible para desplegar y operar OpenClaw en modo enterprise, con perfiles múltiples, control-plane Stage 2, sincronización no interactiva de credenciales Codex y flujo de operaciones reproducible. +Suite operativa para llevar OpenClaw a un estándar de despliegue enterprise: perfiles múltiples, colas Stage 2, auth-sync centralizado, smoke tests y protocolos day-2 reproducibles. -> Este repositorio **no es el producto OpenClaw final**. -> Es la **capa base de infraestructura/protocolo de despliegue** para instalar, reconciliar, purgar, validar y operar entornos OpenClaw de forma consistente. +## Por Qué Nace Esta Suite -## Descripción Corta Sugerida del Repositorio +Nace para resolver un problema operativo concreto: OpenClaw funciona como producto, pero en entornos reales faltaba una capa robusta de infraestructura y protocolo para operar múltiples perfiles y agentes de forma repetible. -Si quieres actualizar la descripción en GitHub, puedes usar esta frase: +Esta suite aparece para cerrar la brecha entre: -`Base Ansible para OpenClaw Enterprise + Stage 2 Control Plane (NATS/NestJS), con auth-sync Codex, smoke tests y operación day-2 reproducible.` +- "funciona en una máquina" y "opera estable en equipos/ambientes". +- "instalación manual" y "ciclo completo backup-purge-install-smoke". +- "credenciales dispersas" y "auth-sync controlado por perfil/agente". +- "ejecución sin trazabilidad" y "observabilidad/control con API y eventos". -## Qué Es y Qué No Es +## Falencias Que Cubre (Frente a Uso Base de OpenClaw) -### Sí es +1. Falta de protocolo multi-perfil/multi-agente: se añade `openclaw_enterprise` y servicios por perfil. +2. 
Falta de orquestación de colas y control central: se añade Stage 2 (`ingress/router/worker/broker/control-api`) con NATS+Postgres. +3. Falta de sincronización de credenciales a escala: se añade `make auth-sync` con escritura de `auth-profiles.json` por agente. +4. Falta de operación day-2 unificada: se estandariza `make backup/purge/install/smoke/reinstall`. +5. Falta de validación post-despliegue: se añade smoke de salud + flujo de cola terminal. +6. Falta de visibilidad en full mode: se integra Prometheus/Grafana/Uptime Kuma. -- Un blueprint de infraestructura para OpenClaw. -- Un conjunto de roles Ansible reutilizables (`openclaw`, `openclaw_enterprise`, `openclaw_control_plane`, `openclaw_cloudflare_tunnel`). -- Un workflow operativo con `Makefile` + `ops/*.sh` para day-0/day-1/day-2. -- Un paquete Stage 2 full/lite para colas, enrutamiento, workers y observabilidad. +## Qué Es y Qué No Es -### No es +### Qué es -- Una app monolítica única de negocio. -- Un reemplazo del repositorio principal de OpenClaw. -- Un instalador "one-click" sin decisiones operativas: aquí se orquesta infraestructura real con perfiles, secretos y reglas de operación. +- Base Ansible de despliegue y operación (protocolo operativo). +- Suite de automatización para OpenClaw en escenarios enterprise. +- Capa de estandarización para equipos DevOps/Platform. -## Novedades Relevantes de Esta Base +### Qué no es -- Operación estandarizada con `make backup/purge/install/auth-sync/smoke/reinstall`. -- `auth-sync` no interactivo para Codex usando credenciales de `/home/efra/.codex`. -- Stage 2 control-plane en dos modos: - - `full` (ejemplo: `efra-core`) - - `lite` (ejemplo: `andrea`) -- Exposición opcional por Cloudflare Tunnel de endpoints locales. -- Documentación de layout instalado con permisos detallados y diagramas Mermaid. -- Endurecimientos recientes en despliegue y control-plane: - - `ExecStart` directo en systemd (sin wrapper shell innecesario). 
- - Healthcheck de control-api alineado con puertos host por modo (`full`/`lite`). - - UID/GID de workers parametrizado (sin hardcode `994:994`). - - `confirm/reject` ahora actualiza estado en DB (`needs_confirmation=false`). - - Escape seguro de contraseña en reconciliación SQL de Postgres. +- No reemplaza el repositorio principal de OpenClaw. +- No es una reescritura del core de OpenClaw. +- No es un instalador "one-click" opaco; es infraestructura explícita y auditable. -## Arquitectura (Vista Rápida) +## Identidad de la Suite -```mermaid -flowchart LR - A[Ansible / Makefile / ops scripts] --> B[openclaw role] - A --> C[openclaw_enterprise role] - A --> D[openclaw_control_plane role] - A --> E[openclaw_cloudflare_tunnel role] +Nombre operativo recomendado: `ClawOps Protocol Suite`. - C --> F[Gateway profile dev-main] - C --> G[Gateway profile andrea] +Descripción corta recomendada para GitHub: - D --> H[efra-core full] - D --> I[andrea lite] +`Suite Ansible para operación enterprise de OpenClaw con Stage 2 Control Plane (NATS/NestJS), auth-sync Codex, smoke tests y ciclo day-2 reproducible.` - H --> H1[NATS + Postgres] - H --> H2[Ingress + Router + Broker] - H --> H3[Workers main/research/browser-login/coolify-ops] - H --> H4[Control API + Prometheus + Grafana + Uptime Kuma] +## Arquitectura Rápida - I --> I1[NATS + Postgres] - I --> I2[Ingress + Router forced main + Worker main + Broker + Control API] +```mermaid +flowchart LR + A[Makefile + ops scripts] --> B[playbooks enterprise/control-plane] + B --> C[role: openclaw] + B --> D[role: openclaw_enterprise] + B --> E[role: openclaw_control_plane] + B --> F[role: openclaw_cloudflare_tunnel] + + D --> G[Gateway profiles] + E --> H[Stage 2 full/lite] + F --> I[Cloudflare loopback exposure opcional] + + H --> H1[NATS JetStream] + H --> H2[Postgres] + H --> H3[Ingress/Router/Workers/Broker] + H --> H4[Control API] + H --> H5[Prom/Grafana/Kuma full mode] ``` -## Flujo de Mensaje (Telegram/API -> Agente -> Resultado) 
+## Flujo Mensajería/Cola ```mermaid sequenceDiagram autonumber participant ING as ingress - participant NATS as NATS JetStream participant RT as router + participant NATS as NATS participant WK as worker-<agent> participant BR as broker participant PG as postgres participant API as control-api - ING->>NATS: publish tasks.ingress - RT->>NATS: consume tasks.ingress - RT->>NATS: publish tasks.agent.<agent> - WK->>NATS: consume tasks.agent.<agent> - WK->>NATS: publish results.agent.<agent> - BR->>NATS: consume results.agent.* - BR->>PG: upsert tasks + insert task_events - API->>PG: GET /tasks, POST /tasks/:id/confirm|reject + ING->>NATS: tasks.ingress + RT->>NATS: tasks.agent.<agent> + WK->>NATS: results.agent.<agent> + BR->>PG: upsert tasks + task_events + API->>PG: consulta + confirm/reject ``` -## Perfiles de Referencia (Inventario `dev`) - -Configurados actualmente en `inventories/dev/group_vars/all.yml`: - -- Gateway Enterprise: - - `dev-main` en `127.0.0.1:19011` con agentes `main/research/browser-login/coolify-ops`. - - `andrea` en `127.0.0.1:19031` con agente `main`. -- Control-plane Stage 2: - - `efra-core` modo `full` (`ingress=30101`, `control-api=39101`, `grafana=31001`, `prometheus=39091`). - - `andrea` modo `lite` (`ingress=30111`, `control-api=39111`). 
- -## Operación Recomendada (Day-2) - -Desde la raíz del repo: +## Operación Recomendada ```bash make backup @@ -117,155 +100,97 @@ make auth-sync PROFILES="dev-main andrea" OAUTH_PROVIDER=openai-codex make smoke ``` -Ciclo completo en una sola orden: +Ciclo completo: ```bash make reinstall CONFIRM=1 ``` -## Targets del Makefile +## Targets Operativos | Target | Propósito | |---|---| -| `make backup` | Respalda estado conocido de OpenClaw + control-plane | -| `make purge CONFIRM=1` | Purga estado runtime (destructivo) | +| `make backup` | Respaldo de estado conocido | +| `make purge CONFIRM=1` | Purga runtime (destructivo) | | `make install` | Reconciliación enterprise + control-plane | -| `make secrets-refactor` | Genera archivo manual para migrar/normalizar secretos | -| `make cloudflare` | Reconciliación exclusiva de tunnel/cloudflared | -| `make auth-sync` | Sincroniza credenciales Codex a perfiles/agentes | -| `make oauth-login` | Alias legado de `make auth-sync` | -| `make smoke` | Pruebas de salud y flujo de cola | -| `make reinstall CONFIRM=1` | `backup + purge + install + smoke` | - -Variables principales: - -- `ENV` (default `dev`) -- `INVENTORY` (default `inventories/<ENV>/hosts.yml`) -- `LIMIT` (default `zennook`) -- `PROFILES` (default `dev-main andrea`) -- `OAUTH_PROVIDER` (default `openai-codex`) -- `MODEL_REF` (default `openai-codex/gpt-5.3-codex`) - -## Auth Sync Codex (No Interactivo) - -`ops/auth-sync.sh` realiza este pipeline: - -1. Lee credenciales fuente (por defecto): - - `/home/efra/.codex/auth.json` - - `/home/efra/.codex/auth-andrea.json` -2. Copia credenciales a: - - `/home/openclaw/.codex/auth.json` - - `/home/openclaw/.codex/auth-andrea.json` -3. Escribe `auth-profiles.json` por agente en cada perfil destino. -4. 
Ajusta modelo por perfil con: - - `openclaw --profile <profile> models set <model_ref>` - -Sobrescrituras opcionales (cargadas desde `/home/efra/.env` si existe): - -- `EFRA_CODEX_HOME` -- `EFRA_CODEX_AUTH_DEFAULT` -- `EFRA_CODEX_AUTH_ANDREA` +| `make secrets-refactor` | Genera base de migración de secretos | +| `make cloudflare` | Reconciliación exclusiva del túnel | +| `make auth-sync` | Sincroniza credenciales Codex por perfil/agente | +| `make oauth-login` | Alias legado de auth-sync | +| `make smoke` | Prueba de salud + flujo de cola | +| `make reinstall CONFIRM=1` | Ciclo end-to-end | -## Smoke, Regresión e Idempotencia +Variables clave: -### Smoke operativo +- `ENV` +- `INVENTORY` +- `LIMIT` +- `PROFILES` +- `OAUTH_PROVIDER` +- `MODEL_REF` -`make smoke` valida, entre otros: +## Auth-Sync No Interactivo -- Estado de stacks Docker Compose esperados. -- Endpoints de salud (`/health`) en ingress y control-api. -- Flujo de cola con `/ingress/simulate` hasta estado terminal en control-api. +`ops/auth-sync.sh`: -### Harness de regresión +1. Lee credenciales fuente (por defecto `/home/efra/.codex/*`). +2. Copia credenciales a `/home/openclaw/.codex`. +3. Escribe `auth-profiles.json` por agente en perfiles destino. +4. Fija modelo por perfil con `openclaw --profile <profile> models set <model_ref>`. -Existe harness Docker CI en `tests/run-tests.sh` con 3 fases: +Overrides vía `/home/efra/.env`: -1. Convergencia. -2. Verificación. -3. Idempotencia. +- `EFRA_CODEX_HOME` +- `EFRA_CODEX_AUTH_DEFAULT` +- `EFRA_CODEX_AUTH_ANDREA` -Estado observado en ejecución del 2026-03-01: +## Pruebas y Calidad Operativa -- Convergencia: `PASS` -- Verificación: `PASS` -- Idempotencia: `FAIL` por 1 cambio en tarea no relacionada a control-plane (`Ensure pnpm directories have correct ownership`). +- `make smoke`: salud ingress/control-api + simulación de cola hasta estado terminal. +- `tests/run-tests.sh`: convergencia/verificación/idempotencia en harness Docker. 
+- `ansible-playbook --syntax-check`: validación de sintaxis de playbooks. -## Estructura del Repositorio +## Estructura del Repo ```text . -├── playbook.yml # instalación base local (role openclaw) +├── playbook.yml ├── playbooks/ -│ ├── enterprise.yml # despliegue enterprise multi-perfil -│ └── control-plane-only.yml # reconciliación dedicada de control-plane ├── roles/ -│ ├── openclaw -│ ├── openclaw_enterprise -│ ├── openclaw_control_plane -│ └── openclaw_cloudflare_tunnel -├── control-plane/ # servicios NestJS Stage 2 -├── inventories/ # dev/staging/prod/research -├── ops/ # scripts operativos usados por Makefile -├── docs/ # runbooks, arquitectura, troubleshooting -└── tests/ # harness Docker de convergencia/verificación/idempotencia +├── control-plane/ +├── inventories/ +├── ops/ +├── docs/ +└── tests/ ``` -## Seguridad y Permisos +## Nota Legal Importante (MIT) -Controles principales que deja esta base: +Sí se puede modificar gran parte del repositorio, documentación y branding. -- Usuario no root para OpenClaw (`openclaw`). -- Secretos por perfil bajo `/etc/openclaw/secrets/*.env`. -- Servicios systemd por perfil de gateway. -- Aislamiento de runtime con Docker para control-plane. -- Endpoints en loopback y exposición opcional por tunnel. +Pero **no** se debe eliminar el cumplimiento de licencia MIT en copias sustanciales del software. En la práctica, eso implica mantener los avisos de licencia/copyright aplicables en los artefactos distribuidos. -Para layout completo con rutas y permisos (`owner:group` + `mode`), revisa: +Por eso, se puede crear identidad propia de suite, pero no borrar obligaciones legales de atribución/licencia. -- [Installed Runtime Layout](docs/architecture-installed-layout.md) - -## Sistemas Operativos Soportados +## Compatibilidad y SO - Debian - Ubuntu - Fedora -### Estado de macOS - -La ejecución bare-metal en macOS está deshabilitada en este repo. 
-El playbook falla explícitamente en `Darwin` para evitar instalación insegura fuera del modelo soportado. +macOS bare-metal está bloqueado en este repo por política de seguridad operativa. -## Documentación Clave +## Documentación Principal -- [Operator Runbook](docs/operator-runbook.md) +- [Architecture](docs/architecture.md) +- [Enterprise Deployment](docs/enterprise-deployment.md) - [Operations Workflow](docs/operations-workflow.md) +- [Operator Runbook](docs/operator-runbook.md) - [Stage 2 Control Plane](docs/control-plane-stage2.md) -- [Enterprise Deployment](docs/enterprise-deployment.md) - [Installed Runtime Layout](docs/architecture-installed-layout.md) - [Cloudflare Tunnel](docs/cloudflare-tunnel.md) - [Troubleshooting](docs/troubleshooting.md) -- [Configuration Guide](docs/configuration.md) -- [Security Architecture](docs/security.md) -- [Agent Guidelines](AGENTS.md) - -## Instalación Manual (Si No Usas Make) - -```bash -ansible-galaxy collection install -r requirements.yml -ansible-playbook -i inventories/dev/hosts.yml playbooks/enterprise.yml -l zennook --become -``` - -Para instalación base local mínima: - -```bash -ansible-playbook playbook.yml --become -``` ## Licencia MIT. Ver [LICENSE](LICENSE). - -## Referencias - -- OpenClaw: https://github.com/openclaw/openclaw -- Issues de esta base: https://github.com/openclaw/openclaw-ansible/issues diff --git a/control-plane/README.md b/control-plane/README.md index 3d31f83..bd3bdbb 100644 --- a/control-plane/README.md +++ b/control-plane/README.md @@ -1,32 +1,70 @@ -# OpenClaw Control Plane (Stage 2) +# ClawOps Stage 2 Control Plane -NestJS microservices + NATS JetStream control plane for multi-agent routing: +Control-plane de la suite operativa: microservicios NestJS + NATS JetStream para ruteo multi-agente, persistencia de estados y control de ejecución. 
-- `ingress`: receives Telegram/API payloads and emits `tasks.ingress` -- `router`: classifies intent and routes to `tasks.agent.` -- `worker`: executes per agent and emits `results.agent.` -- `broker`: persists results and optionally replies to Telegram -- `control-api`: task state, queue stats, confirm/reject actions +## Servicios -## Run locally +- `ingress`: recibe tráfico Telegram/API y publica `tasks.ingress`. +- `router`: clasifica y enruta a `tasks.agent.`. +- `worker`: consume por agente y publica `results.agent.`. +- `broker`: persiste resultados/eventos y puede responder a Telegram. +- `control-api`: consulta tareas, cola y decisiones (`confirm/reject`). + +## Qué Falencia Resuelve + +1. Falta de bus/eventos para tareas multi-agente. +2. Falta de estado persistente de ejecución. +3. Falta de API de control para operaciones y confirmaciones. +4. Falta de trazabilidad de eventos por tarea. + +## Contrato de Mensajes + +### Task envelope + +- `taskId` +- `profile` +- `source.channel/chatId/userId` +- `text` +- `intent` +- `targetAgent` +- `status` + +### Result envelope + +- `taskId` +- `profile` +- `fromAgent` +- `status` +- `summary` +- `fullResponse` +- `needsConfirmation` + +## Ejecución Local ```bash -npm install -npm run build -npm run start:ingress -npm run start:router -npm run start:worker -npm run start:broker -npm run start:control-api +pnpm install +pnpm run build +pnpm run start:ingress +pnpm run start:router +pnpm run start:worker +pnpm run start:broker +pnpm run start:control-api ``` -Environment variables: +## Variables de Entorno Relevantes - `OPENCLAW_PROFILE` - `NATS_URL` - `NATS_STREAM` - `POSTGRES_URL` -- `TELEGRAM_BOT_TOKEN` (optional) -- `TELEGRAM_DEFAULT_CHAT_ID` (optional) -- `ROUTER_FORCED_AGENT` (optional) -- `WORKER_AGENT_ID` (for worker service) +- `WORKER_AGENT_ID` +- `WORKER_EXEC_MODE` +- `OPENCLAW_BIN` +- `OPENCLAW_HOME` +- `OPENCLAW_ENV_FILE` +- `OPENCLAW_UID` +- `OPENCLAW_GID` + +## Nota + +Este paquete se instala y 
reconcilia desde Ansible (`role: openclaw_control_plane`) y forma parte de la ClawOps Protocol Suite. diff --git a/docs/architecture.md b/docs/architecture.md index 1034660..f54fea2 100644 --- a/docs/architecture.md +++ b/docs/architecture.md @@ -1,132 +1,104 @@ --- -title: Architecture -description: Technical implementation details +title: ClawOps Suite Architecture +summary: Arquitectura técnica de la suite operativa sobre OpenClaw (roles, flujos, capas y controles). --- -# Architecture +# ClawOps Suite Architecture -## Component Overview +## Objetivo Arquitectónico -``` -┌─────────────────────────────────────────┐ -│ UFW Firewall (SSH only) │ -└──────────────┬──────────────────────────┘ - │ -┌──────────────┴──────────────────────────┐ -│ DOCKER-USER Chain (iptables) │ -│ Blocks all external container access │ -└──────────────┬──────────────────────────┘ - │ -┌──────────────┴──────────────────────────┐ -│ Docker Daemon │ -│ - Non-root containers │ -│ - Localhost-only binding │ -└──────────────┬──────────────────────────┘ - │ -┌──────────────┴──────────────────────────┐ -│ OpenClaw Container │ -│ User: openclaw │ -│ Port: 127.0.0.1:3000 │ -└──────────────────────────────────────────┘ -``` +Separar claramente tres capas: -## File Structure +1. Capa producto (OpenClaw runtime). +2. Capa plataforma (Ansible roles/playbooks). +3. Capa operación (Makefile + `ops/*.sh` + smoke/runbooks). -``` -/opt/openclaw/ -├── Dockerfile -├── docker-compose.yml +Esta separación permite operación reproducible y control de drift en entornos reales. 
-/home/openclaw/.openclaw/ -├── config.yml -├── sessions/ -└── credentials/ +## Mapa de Componentes -/etc/systemd/system/ -└── openclaw.service +```mermaid +flowchart TB + subgraph OPS[Operation Layer] + MK[Makefile] + SH[ops/*.sh] + SM[smoke + backup/purge/install] + end -/etc/docker/ -└── daemon.json + subgraph IA[Infrastructure as Code Layer] + PB[playbooks/enterprise.yml] + R1[role openclaw] + R2[role openclaw_enterprise] + R3[role openclaw_control_plane] + R4[role openclaw_cloudflare_tunnel] + end -/etc/ufw/ -└── after.rules (DOCKER-USER chain) -``` + subgraph RT[Runtime Layer] + GW[Gateway profiles] + CP[Stage 2 Control Plane] + CF[Cloudflare tunnel opcional] + end -## Service Management + MK --> SH --> PB + PB --> R1 + PB --> R2 + PB --> R3 + PB --> R4 -OpenClaw runs as a systemd service that manages the Docker container: - -```bash -# Systemd controls Docker Compose -systemd → docker compose → openclaw container + R2 --> GW + R3 --> CP + R4 --> CF ``` -## Installation Flow - -1. **Tailscale Setup** (`tailscale.yml`) - - Add Tailscale repository - - Install Tailscale package - - Display connection instructions - -2. **User Creation** (`user.yml`) - - Create `openclaw` system user +## Stage 2 Runtime (Full/Lite) -3. **Docker Installation** (`docker.yml`) - - Install Docker CE + Compose V2 - - Add user to docker group - - Create `/etc/docker` directory +```mermaid +flowchart LR + IN[ingress] --> N[(NATS JetStream)] + RT[router] --> N + W[worker-*] --> N + B[broker] --> N + B --> P[(Postgres)] + A[control-api] --> P -4. **Firewall Setup** (`firewall.yml`) - - Install UFW - - Configure DOCKER-USER chain - - Configure Docker daemon (`/etc/docker/daemon.json`) - - Allow SSH (22/tcp) and Tailscale (41641/udp) + O[observability full mode]:::obs + O --> PR[prometheus] + O --> GR[grafana] + O --> UK[uptime-kuma] -5. **Node.js Installation** (`nodejs.yml`) - - Add NodeSource repository - - Install Node.js 22.x - - Install pnpm globally - -6. 
**OpenClaw Setup** (`openclaw.yml`) - - Create directories - - Generate configs from templates - - Build Docker image - - Start container via Compose - - Install systemd service - -## Key Design Decisions - -### Why UFW + DOCKER-USER? + classDef obs fill:#eef,stroke:#99c,stroke-width:1px; +``` -Docker manipulates iptables directly, bypassing UFW. The DOCKER-USER chain is evaluated before Docker's FORWARD chain, allowing us to block traffic before Docker sees it. +## Falencias Cubiertas por Diseño -### Why Localhost Binding? +| Falencia operativa | Respuesta en la suite | +|---|---| +| Instalación no repetible | Playbooks + defaults + inventarios por ambiente | +| Drift entre perfiles/agentes | Perfiles declarativos + reconciliación Ansible | +| Sin control de cola/estado | NATS + broker + control-api + PostgreSQL | +| Confirmaciones sin transición persistida | `control-api` actualiza estado y eventos en DB | +| Credenciales manuales por agente | `auth-sync` no interactivo por perfil/agente | +| Day-2 artesanal | Targets `make` estandarizados | -Defense in depth. Even if DOCKER-USER fails, localhost binding prevents external access. +## Seguridad Operativa -### Why Systemd Service? +- Secrets por perfil en `/etc/openclaw/secrets/*.env`. +- Servicios con aislamiento de usuario/perfil. +- Endpoints internos en loopback (publicación externa opcional por tunnel). +- Workers con UID/GID parametrizados para evitar supuestos rígidos de host. -- Auto-start on boot -- Clean lifecycle management -- Integration with system logs -- Dependency management (after Docker) +## Rutas Críticas -### Why Non-Root Container? +- Playbook enterprise: `playbooks/enterprise.yml` +- Roles: `roles/openclaw*` +- Control-plane source: `control-plane/` +- Inventarios: `inventories/*` +- Operación: `ops/*`, `Makefile` -Principle of least privilege. If container is compromised, attacker has limited privileges. 
+## Decisión de Compatibilidad -## Ansible Task Order +macOS bare-metal se considera fuera del modelo de ejecución seguro/soportado para esta suite. -``` -main.yml -├── tailscale.yml (VPN setup) -├── user.yml (create openclaw user) -├── docker.yml (install Docker, create /etc/docker) -├── firewall.yml (configure UFW + Docker daemon) -├── nodejs.yml (Node.js + pnpm) -└── openclaw.yml (container setup) -``` +## Relación con OpenClaw -Order matters: Docker must be installed before firewall configuration because: -1. `/etc/docker` directory must exist for `daemon.json` -2. Docker service must exist to be restarted after config changes +Esta suite es una capa de protocolo y operación sobre OpenClaw; no reemplaza el producto. diff --git a/docs/control-plane-stage2.md b/docs/control-plane-stage2.md index 6ec3b04..baae511 100644 --- a/docs/control-plane-stage2.md +++ b/docs/control-plane-stage2.md @@ -1,90 +1,60 @@ --- -title: Stage 2 Control Plane (NATS + NestJS) -summary: Full/lite queue orchestration package installable per profile +title: Stage 2 Control Plane (ClawOps Suite) +summary: Capa de orquestación de cola/estado para operación multi-agente en OpenClaw enterprise. --- # Stage 2 Control Plane -This repository now includes a reusable Stage 2 package for queue orchestration and telemetry. +## Contexto -## Modes +Stage 2 es la respuesta a una necesidad operativa: cuando hay múltiples agentes y perfiles, hace falta un plano de control explícito para enrutar, persistir, observar y decidir. 
-- `full` (`efra-core`): complete stack - - NATS JetStream - - PostgreSQL state store - - NestJS services: `ingress`, `router`, `broker`, `worker-main`, `worker-research`, `worker-browser-login`, `worker-coolify-ops`, `control-api` - - Observability: Prometheus + Grafana + Uptime Kuma -- `lite` (`andrea`): minimal direct worker path - - NATS JetStream - - PostgreSQL state store - - NestJS services: `ingress`, `router` (forced to `main`), `worker-main`, `broker`, `control-api` +## Modos -## Intent Routing +- `full`: + - NATS + Postgres + - ingress/router/broker/control-api + - workers múltiples + - observabilidad (Prometheus/Grafana/Uptime Kuma) +- `lite`: + - NATS + Postgres + - ingress/router-forced-main/worker-main/broker/control-api -Ingress receives Telegram/API messages and publishes `tasks.ingress`. -Router classifies intent and emits `tasks.agent.`. -Workers consume per-agent queues and emit `results.agent.`. -Broker persists outputs and can send Telegram replies. +## Flujo Operativo -Ingress also supports a direct Telegram command: +1. `ingress` publica tarea. +2. `router` decide destino. +3. `worker` ejecuta. +4. `broker` persiste y publica salida. +5. `control-api` consulta estados y aplica decisiones. -- `/agents` (or `/agents@`) to list available agents and intent mappings without queueing a task. - -## Contract - -Task envelope fields: -- `taskId` -- `profile` -- `source.channel/chatId/userId` -- `text` -- `intent` -- `targetAgent` -- `status` - -Result envelope fields: -- `taskId` -- `fromAgent` -- `status` -- `summary` -- `fullResponse` -- `needsConfirmation` - -## Deployment - -Enabled through `playbooks/enterprise.yml` with role `openclaw_control_plane`. 
- -Inventory variables (`inventories//group_vars/all.yml`): -- `openclaw_control_plane_enabled` -- `openclaw_control_plane_profiles` - -Secrets (`inventories//group_vars/vault.yml`): -- `vault_openclaw_cp_postgres_password_*` -- `vault_openclaw_cp_nats_password_*` -- `vault_telegram_bot_token_*` -- `vault_telegram_default_chat_id_*` - -## Operational Endpoints +## Endpoints Principales - Ingress: `http://127.0.0.1:/telegram/webhook` +- Simulación: `http://127.0.0.1:/ingress/simulate` - Control API: `http://127.0.0.1:/tasks` -- Queue stats: `http://127.0.0.1:/queues` -- Grafana (`full` only): `http://127.0.0.1:` -- Prometheus (`full` only): `http://127.0.0.1:` +- Cola: `http://127.0.0.1:/queues` + +## Endurecimientos Incluidos -You can publish these loopback endpoints through Cloudflare Tunnel subdomains by enabling -`openclaw_cloudflare_tunnel_*` variables in inventory (see `docs/cloudflare-tunnel.md`). +- Health probe con defaults coherentes por modo (`full`/`lite`). +- UID/GID de worker parametrizado (`OPENCLAW_UID/OPENCLAW_GID`). +- Confirm/reject con transición real de estado en DB. +- Reconciliación SQL de password con escaping seguro. -## Packaging for other profiles +## Integración con la Suite -To install this package on another profile, add one object to `openclaw_control_plane_profiles`. -No code changes are required, only profile variables and secrets. +Se habilita vía `openclaw_control_plane_enabled` y perfiles en inventario. -## Browser worker networking (full mode) +Despliegue recomendado: -For browser-driven flows (`browser-login`) the full stack template uses: +```bash +make install +make smoke +``` -- `network_mode: host` -- `shm_size: "1gb"` -- worker-local `NATS_URL` override to `127.0.0.1:` +## Referencias -This keeps queue consumption stable while allowing browser-related operations to reach host-local gateway/browser relay paths. 
+- [Operator Runbook](operator-runbook.md) +- [Operations Workflow](operations-workflow.md) +- [Architecture](architecture.md) diff --git a/docs/enterprise-deployment.md b/docs/enterprise-deployment.md index ad110d5..4e12978 100644 --- a/docs/enterprise-deployment.md +++ b/docs/enterprise-deployment.md @@ -1,143 +1,92 @@ --- -title: Enterprise Deployment -summary: Multi-environment, multi-profile OpenClaw deployment with Ansible +title: Enterprise Deployment (ClawOps Suite) +summary: Despliegue multi-ambiente y multi-perfil con OpenClaw + Stage 2 bajo un protocolo operativo único. --- # Enterprise Deployment -This repository now includes an enterprise deployment path with: +## Propósito -- Multi-environment inventories: `dev`, `staging`, `prod`, `research` -- Multi-profile gateway services per host -- Multi-agent profile config generation -- Multi-provider, multi-model defaults (OpenAI + Anthropic) -- Secret isolation via per-profile `EnvironmentFile` -- Stage 2 control-plane package (`full`/`lite`) with NATS + NestJS routing -- Optional Cloudflare Tunnel role for subdomain exposure of loopback services +Estandarizar despliegues enterprise donde un solo host o conjunto de hosts necesita: -## Files +- múltiples perfiles gateway, +- múltiples agentes por perfil, +- control de colas/estado, +- operación repetible y auditable. 
-- Playbook: `playbooks/enterprise.yml` -- Role: `roles/openclaw_enterprise` -- Stage 2 role: `roles/openclaw_control_plane` -- Cloudflare role: `roles/openclaw_cloudflare_tunnel` -- Stage 2 services source: `control-plane/` -- Inventories: `inventories//...` +## Qué se despliega -## Run +- `playbooks/enterprise.yml` +- `roles/openclaw` +- `roles/openclaw_enterprise` +- `roles/openclaw_control_plane` +- `roles/openclaw_cloudflare_tunnel` (opcional) -```bash -ansible-playbook -i inventories/dev/hosts.yml playbooks/enterprise.yml --ask-become-pass -ansible-playbook -i inventories/staging/hosts.yml playbooks/enterprise.yml --ask-become-pass -ansible-playbook -i inventories/prod/hosts.yml playbooks/enterprise.yml --ask-become-pass -ansible-playbook -i inventories/research/hosts.yml playbooks/enterprise.yml --ask-become-pass -``` +## Modelo Multi-Perfil -Or use the helper script: +Cada perfil define al menos: -```bash -./run-enterprise-playbook.sh dev -./run-enterprise-playbook.sh staging -./run-enterprise-playbook.sh prod -./run-enterprise-playbook.sh research -``` +- estado (`state_dir`, `config_path`, `workspace_root`), +- puerto gateway, +- secretos de entorno, +- lista de agentes, +- políticas de tools/modelos/sandbox. -### Resilient rollout behavior +## Modelo Stage 2 -`playbooks/enterprise.yml` is configured for resilient multi-node rollout: +Dos modos soportados: -- `serial: 1` (one node at a time) -- `ignore_unreachable: true` (continue when one node is down) -- `any_errors_fatal: false` -- `max_fail_percentage: 100` (do not abort the whole batch on partial failure) +- `full`: cola completa + observabilidad. +- `lite`: camino mínimo para ejecución directa. 
-You can override at runtime: +## Ejecución ```bash -./run-enterprise-playbook.sh dev -e openclaw_rollout_serial=2 -./run-enterprise-playbook.sh dev -e openclaw_ignore_unreachable=false -./run-enterprise-playbook.sh dev -e openclaw_max_fail_percentage=50 +ansible-playbook -i inventories/dev/hosts.yml playbooks/enterprise.yml --become ``` -## Secrets - -Store credentials in Ansible Vault and reference them from `inventories/*/group_vars/all.yml`: - -- `vault_openclaw_gateway_token_*` -- `vault_openai_api_key_*` (optional when using OAuth/browser auth) -- `vault_anthropic_api_key_*` (optional when using OAuth/browser auth) -- `vault_cloudflare_tunnel_id` (optional when Cloudflare role is enabled) -- `vault_cloudflare_tunnel_credentials_json` (optional when Cloudflare role is enabled) - -The role writes `/etc/openclaw/secrets/.env` with mode `0640`, owner `root`, group `openclaw`. - -### Initialize vault files - -Copy example files and encrypt: +o ```bash -cp inventories/dev/group_vars/vault.example.yml inventories/dev/group_vars/vault.yml -cp inventories/staging/group_vars/vault.example.yml inventories/staging/group_vars/vault.yml -cp inventories/prod/group_vars/vault.example.yml inventories/prod/group_vars/vault.yml -cp inventories/research/group_vars/vault.example.yml inventories/research/group_vars/vault.yml - -ansible-vault encrypt inventories/dev/group_vars/vault.yml -ansible-vault encrypt inventories/staging/group_vars/vault.yml -ansible-vault encrypt inventories/prod/group_vars/vault.yml -ansible-vault encrypt inventories/research/group_vars/vault.yml +./run-enterprise-playbook.sh dev ``` -`vault.yml` files are git-ignored by default. 
- -## Service model - -Each profile produces: - -- Config file: `/openclaw.json` -- Systemd unit: `openclaw-gateway-.service` -- Runtime isolation via per-profile `OPENCLAW_PROFILE`, `OPENCLAW_STATE_DIR`, `OPENCLAW_CONFIG_PATH`, `OPENCLAW_GATEWAY_PORT` - -## Android nodes in multi-node topology +## Comportamiento de Rollout -Android is a companion node (`role: node`) that connects to the gateway WebSocket and -must be paired on the gateway side. It does not run the gateway service and should not -be treated as an Ansible SSH target. +- `serial` configurable. +- tolerancia configurable a hosts no disponibles. +- ejecución progresiva para reducir riesgo de corte total. -Recommended pattern: +## Secrets y Gobernanza -- Keep Linux gateways in `openclaw_gateway`. -- Keep Android references in `openclaw_mobile_nodes` as inventory metadata. -- Operate pairing/state from a gateway host: +Variables sensibles deben residir en vault por ambiente: -```bash -openclaw nodes pending -openclaw nodes approve -openclaw nodes status -``` - -For tailnet-only connectivity, bind gateway profiles to tailnet interfaces and use -gateway tailnet IP/MagicDNS from Android. +- tokens gateway, +- credenciales NATS/Postgres, +- tokens Telegram, +- credenciales tunnel si aplica. -## Browser login agent (OpenClaw-managed browser) +La suite escribe archivos de entorno por perfil y separa secretos de configuración funcional. -Enterprise profiles can include a dedicated `browser-login` agent with: +## Qué Falencia Cubre Este Modo Enterprise -- `tools.profile: full` + `tools.allow: ["browser"]` (browser-only surface) -- `sandbox.mode: "off"` for reliable host login flows on strict sites -- profile-level browser default set to `openclaw` +1. Evita mezcla de estados entre perfiles. +2. Permite aislar rutas de agentes por contexto de negocio. +3. Habilita crecimiento incremental sin re-arquitectura manual. +4. Reduce dependencia de pasos ad-hoc en operadores individuales. 
-Operational flow (from gateway host): +## Integración con Operación Day-2 -```bash -openclaw browser --browser-profile openclaw start -openclaw browser --browser-profile openclaw open https://x.com -``` +Para operación continua usar: -Then sign in manually in the managed browser profile. Do not share credentials with the model. +- `make install` +- `make auth-sync` +- `make smoke` +- `make backup` +- `make purge CONFIRM=1` -## Notes +## Referencias -- Existing `playbook.yml` is unchanged for one-command installs. -- Use `playbooks/enterprise.yml` for multi-node production topology. -- Stage 2 queue orchestration and telemetry details: `docs/control-plane-stage2.md`. -- Cloudflare exposure details: `docs/cloudflare-tunnel.md`. +- [Operations Workflow](operations-workflow.md) +- [Stage 2 Control Plane](control-plane-stage2.md) +- [Operator Runbook](operator-runbook.md) diff --git a/docs/operations-workflow.md b/docs/operations-workflow.md index 6a96cc0..89b1ac3 100644 --- a/docs/operations-workflow.md +++ b/docs/operations-workflow.md @@ -1,92 +1,79 @@ --- -title: Operations Workflow (Backup, Purge, Install) -summary: Makefile-driven clean install/uninstall cycle for OpenClaw + Stage 2 control-plane +title: Operations Workflow (ClawOps Suite) +summary: Protocolo day-2 para operar OpenClaw enterprise de manera repetible. --- # Operations Workflow -This repository provides a Makefile interface over `ops/*.sh` scripts: +## Idea Central -- `make backup` -- `make purge CONFIRM=1` -- `make install` -- `make secrets-refactor` -- `make cloudflare` -- `make auth-sync` -- `make smoke` -- `make reinstall CONFIRM=1` +La suite define un protocolo simple: cada operación crítica debe tener un comando único y repetible. -## Why this split +Por eso `Makefile` expone comandos estables y `ops/*.sh` encapsula la implementación. -- `Makefile`: stable operator commands. -- `ops/*.sh`: implementation details, safe to extend. 
- -## Auth sync (Codex) - -Credential sync is now non-interactive and uses Codex auth files from the `efra` user. - -Use: +## Ciclo Canónico ```bash +make backup +make purge CONFIRM=1 +make install make auth-sync PROFILES="dev-main andrea" OAUTH_PROVIDER=openai-codex -# legacy alias (same behavior) -make oauth-login PROFILES="dev-main andrea" OAUTH_PROVIDER=openai-codex +make smoke ``` -Optional environment overrides (loaded from `/home/efra/.env` when present): +Para ejecución completa: -- `EFRA_CODEX_HOME` (default: `/home/efra/.codex`) -- `EFRA_CODEX_AUTH_DEFAULT` (default: `/home/efra/.codex/auth.json`) -- `EFRA_CODEX_AUTH_ANDREA` (default: `/home/efra/.codex/auth-andrea.json`) - -The sync process: +```bash +make reinstall CONFIRM=1 +``` -- copies auth files to `/home/openclaw/.codex/` -- writes `openai-codex` OAuth profiles into each target profile's `auth-profiles.json` -- sets profile default model to `openai-codex/gpt-5.3-codex` (configurable with `MODEL_REF`) +## Comandos y Rol Operativo -Runtime command environment still auto-loads `/etc/openclaw/secrets/.env` and exports: +- `make backup`: preserva estado operativo antes de cambios. +- `make purge`: limpia estado runtime para reinstalación controlada. +- `make install`: reconcilia enterprise + control-plane. +- `make auth-sync`: propaga credenciales Codex a perfiles/agentes. +- `make smoke`: valida salud + flujo cola end-to-end. -- `HOME=/home/openclaw` -- `OPENCLAW_BUNDLED_PLUGINS_DIR=/home/openclaw/.openclaw/bundled-extensions` +## Auth-Sync como Control de Deriva -## Manual secrets refactor +`auth-sync` existe para resolver una falencia operativa común: credenciales divergentes por agente/perfil. -Before a clean install, generate and review a manual migration file: +Estrategia: -```bash -make secrets-refactor ENV=dev LIMIT=zennook -``` +1. Fuente central en `/home/efra/.codex`. +2. Espejo en `/home/openclaw/.codex`. +3. Escritura determinista de `auth-profiles.json` por agente. +4. 
Alineación de modelo por perfil. -This creates: +## Validación de Secretos -- `inventories/dev/group_vars/vault.manual.refactor.yml` +`make install` ejecuta validación previa de secretos para bloquear despliegues incompletos. -Then: +Complemento: ```bash -# review and merge into vault.yml -./ops/validate-secrets.sh +make secrets-refactor ``` -`make install` now runs `./ops/validate-secrets.sh` first and aborts if required keys are -missing or still using placeholder values. +Genera base de migración manual para homogeneizar vault por ambiente. -Cloudflare reconcile (subdomain exposure only): +## Qué Falencias Cubre Este Workflow -```bash -make cloudflare ENV=dev LIMIT=zennook -``` +1. Cambios manuales no auditables. +2. Reinstalaciones inconsistentes. +3. Pérdida de estado por no hacer backup previo. +4. Despliegues "verdes" sin smoke real de cola. -## Defaults +## Defaults de Operación - `ENV=dev` - `INVENTORY=inventories/dev/hosts.yml` - `LIMIT=zennook` - `PROFILES="dev-main andrea"` -Override per command, for example: +## Referencias -```bash -make install ENV=staging LIMIT=fedora -``` +- [Operator Runbook](operator-runbook.md) +- [Enterprise Deployment](enterprise-deployment.md) +- [Installed Runtime Layout](architecture-installed-layout.md) From c5ca81a6fb1c5e4db5079b8506dca3bfb8499da5 Mon Sep 17 00:00:00 2001 From: Efrain Garay Date: Sun, 1 Mar 2026 21:31:18 -0300 Subject: [PATCH 15/16] feat: seed agent identities and add automated install target --- AGENT_HANDOFF.md | 27 ++ Makefile | 15 +- docs/architecture-installed-layout.md | 234 ++++++++++++++++++ inventories/dev/group_vars/all.yml | 101 ++++++++ .../openclaw_control_plane/tasks/profile.yml | 4 +- roles/openclaw_enterprise/tasks/main.yml | 102 ++++++++ .../templates/workspace-agents.md.j2 | 36 +++ .../templates/workspace-identity.md.j2 | 7 + .../templates/workspace-soul.md.j2 | 22 ++ .../templates/workspace-user.md.j2 | 22 ++ 10 files changed, 567 insertions(+), 3 deletions(-) create mode 100644 
AGENT_HANDOFF.md create mode 100644 docs/architecture-installed-layout.md create mode 100644 roles/openclaw_enterprise/templates/workspace-agents.md.j2 create mode 100644 roles/openclaw_enterprise/templates/workspace-identity.md.j2 create mode 100644 roles/openclaw_enterprise/templates/workspace-soul.md.j2 create mode 100644 roles/openclaw_enterprise/templates/workspace-user.md.j2 diff --git a/AGENT_HANDOFF.md b/AGENT_HANDOFF.md new file mode 100644 index 0000000..3617b11 --- /dev/null +++ b/AGENT_HANDOFF.md @@ -0,0 +1,27 @@ +# Codex Agent Handoff + +## Current State (2026-03-01) +- Enterprise install works end-to-end for `dev-main` + `andrea` profiles; control-plane is deployed via Docker Compose stacks under `/home/efra/openclaw-control-plane/` and managed via `make reinstall CONFIRM=1 ENV=dev LIMIT=zennook`. +- Postgres auth reconciliation now uses profile-specific admin credentials before flagging health check failures (`roles/openclaw_control_plane/tasks/profile.yml`). +- Smoke flow verifies queue + control API by checking `/tasks/{taskId}` until the task reaches a terminal status; `ops/smoke.sh` now retries and reports last status. +- OAuth login is managed via `make oauth-login PROFILES="dev-main andrea" OAUTH_PROVIDER=openai-codex`, which automatically sources `/etc/openclaw/secrets/<profile>.env` and lists auth profiles when done (`ops/oauth-login.sh`). +- Secrets refactor pipeline adds `ops/secrets-refactor.sh`, `ops/validate-secrets.sh`, and the Makefile target `make secrets-refactor` (docs updated accordingly). + +## Handoff Checklist +1. Confirm `/etc/openclaw/secrets/dev-main.env` and `/etc/openclaw/secrets/andrea.env` contain the required temp tokens; backups are stored under `backups/`. +2. Run `make reinstall CONFIRM=1 ENV=dev LIMIT=zennook` if the environment is dirty again; the playbooks already handle purge/install/smoke in one shot. +3. 
After OAuth login you still need to populate `auth-profiles.json` for each agent; run `openclaw --profile <profile> models auth list` to see active entries. +4. Verify Telegram tokens via `cat /etc/openclaw/secrets/dev-main.env` (mask the values in outputs). They are also referenced in `inventories/dev/group_vars/vault.yml` and each control-plane `.env` file. + +## Next Steps for Codex agent +- Finish the implementation plan for Ansible multi-agent deployment (already captured elsewhere, but double-check architecture docs and inventory). Copy actionable instructions into the reserved roadmap file. +- When writing PRs, include an `@codex` mention, request a full review, and ask for an architecture implementation plan per earlier requirements. +- Keep `ops/oauth-login.sh` and `ops/smoke.sh` in sync with any profile additions (e.g., add new profile names to `PROFILES` in `Makefile`). + +## Useful Commands +- `make secrets-refactor ENV=dev LIMIT=zennook` +- `make reinstall CONFIRM=1 ENV=dev LIMIT=zennook` +- `make oauth-login PROFILES="dev-main andrea" OAUTH_PROVIDER=openai-codex` +- `make smoke ENV=dev LIMIT=zennook` + +Keep notes in this file before handing off to another Codex agent; update the "Next Steps" section if you take new actions. 
diff --git a/Makefile b/Makefile index 2dddd66..31488f0 100644 --- a/Makefile +++ b/Makefile @@ -9,7 +9,7 @@ PROFILES ?= dev-main andrea OAUTH_PROVIDER ?= openai-codex MODEL_REF ?= openai-codex/gpt-5.3-codex -.PHONY: help backup purge install cloudflare auth-sync oauth-login smoke reinstall secrets-refactor +.PHONY: help backup purge install auto-install cloudflare auth-sync oauth-login smoke reinstall secrets-refactor help: @echo "OpenClaw Ops Targets" @@ -17,6 +17,7 @@ help: @echo " make backup Backup current OpenClaw + control-plane state" @echo " make purge CONFIRM=1 Purge deployed state and containers" @echo " make install Install/reconcile enterprise + control-plane" + @echo " make auto-install Automated install flow (auth-sync + install + smoke)" @echo " make secrets-refactor Build manual secrets migration file + validate vault" @echo " make cloudflare Reconcile Cloudflare tunnel/service only" @echo " make auth-sync Sync Codex creds from /home/efra/.codex to OpenClaw profiles" @@ -27,6 +28,7 @@ help: @echo "Variables:" @echo " ENV=$(ENV) INVENTORY=$(INVENTORY) LIMIT=$(LIMIT)" @echo " PROFILES='$(PROFILES)' OAUTH_PROVIDER=$(OAUTH_PROVIDER) MODEL_REF=$(MODEL_REF)" + @echo " AUTO_PURGE=0 AUTO_BACKUP=0 (used by auto-install)" backup: @ENV="$(ENV)" INVENTORY="$(INVENTORY)" LIMIT="$(LIMIT)" ./ops/backup.sh @@ -38,6 +40,17 @@ purge: install: @ENV="$(ENV)" INVENTORY="$(INVENTORY)" LIMIT="$(LIMIT)" ./ops/install.sh +auto-install: + @if [[ "$(AUTO_BACKUP)" == "1" ]]; then \ + $(MAKE) backup ENV="$(ENV)" INVENTORY="$(INVENTORY)" LIMIT="$(LIMIT)"; \ + fi + @if [[ "$(AUTO_PURGE)" == "1" ]]; then \ + $(MAKE) purge CONFIRM=1 ENV="$(ENV)" INVENTORY="$(INVENTORY)" LIMIT="$(LIMIT)"; \ + fi + @$(MAKE) auth-sync ENV="$(ENV)" INVENTORY="$(INVENTORY)" LIMIT="$(LIMIT)" PROFILES="$(PROFILES)" OAUTH_PROVIDER="$(OAUTH_PROVIDER)" MODEL_REF="$(MODEL_REF)" + @$(MAKE) install ENV="$(ENV)" INVENTORY="$(INVENTORY)" LIMIT="$(LIMIT)" + @$(MAKE) smoke ENV="$(ENV)" INVENTORY="$(INVENTORY)" 
LIMIT="$(LIMIT)" + secrets-refactor: @ENV="$(ENV)" INVENTORY="$(INVENTORY)" LIMIT="$(LIMIT)" ./ops/secrets-refactor.sh diff --git a/docs/architecture-installed-layout.md b/docs/architecture-installed-layout.md new file mode 100644 index 0000000..daff9f0 --- /dev/null +++ b/docs/architecture-installed-layout.md @@ -0,0 +1,234 @@ +--- +title: Installed Runtime Layout (dev-main + efra-core) +summary: Detailed installed system structure, runtime topology, and permissions for OpenClaw enterprise profile dev-main and control-plane profile efra-core. +--- + +# Installed Runtime Layout (dev-main + efra-core) + +This document describes how Ansible leaves the system installed for: + +- Gateway enterprise profile: `dev-main` +- Control-plane profile (full mode): `efra-core` + +## 1) Disk Layout + Permissions (Detailed) + +```mermaid +flowchart LR + subgraph SRC["Ansible source (/home/efra/openclaw-ansible)"] + INV["inventories/dev/group_vars/all.yml"] + ER["role: openclaw_enterprise"] + CR["role: openclaw_control_plane"] + AS["ops/auth-sync.sh (make auth-sync)"] + end + + subgraph ETC["System artifacts (/etc + systemd)"] + EROOT["/etc/openclaw\\n750 root:openclaw"] + ESEC["/etc/openclaw/secrets\\n750 root:openclaw"] + EDEV["/etc/openclaw/secrets/dev-main.env\\n640 root:openclaw"] + EAND["/etc/openclaw/secrets/andrea.env\\n640 root:openclaw"] + UDEV["/etc/systemd/system/openclaw-gateway-dev-main.service\\n644 root:root"] + UAND["/etc/systemd/system/openclaw-gateway-andrea.service\\n644 root:root"] + end + + subgraph GW["Gateway profile dev-main (/home/openclaw/.openclaw-dev-main)"] + GROOT["state dir\\n755 openclaw:openclaw"] + GCFG["openclaw.json\\n600 openclaw:openclaw"] + GAG["agents/\\n755 openclaw:openclaw"] + + AMAIN["agents/main/agent\\n700 openclaw:openclaw"] + ARES["agents/research/agent\\n700 openclaw:openclaw"] + ABRO["agents/browser-login/agent\\n700 openclaw:openclaw"] + ACOO["agents/coolify-ops/agent\\n700 openclaw:openclaw"] + + PMAIN["auth-profiles.json 
(main)\\n600 openclaw:openclaw"] + PRES["auth-profiles.json (research)\\n600 openclaw:openclaw"] + PBRO["auth-profiles.json (browser-login)\\n600 openclaw:openclaw"] + PCOO["auth-profiles.json (coolify-ops)\\n600 openclaw:openclaw"] + + WMAIN["workspace\\n755 openclaw:openclaw"] + WRES["workspace-research\\n755 openclaw:openclaw"] + WBRO["workspace-browser-login\\n755 openclaw:openclaw"] + WCOO["workspace-coolify-ops\\n755 openclaw:openclaw"] + + SMAIN["agents/main/sessions\\n755 openclaw:openclaw"] + GWPROC["systemd: openclaw-gateway-dev-main\\nUser=openclaw Group=openclaw\\nbind 127.0.0.1:19011"] + end + + subgraph COD["Codex creds path"] + ECOD["/home/efra/.codex\\n(source creds)"] + OCOD["/home/openclaw/.codex\\n700 openclaw:openclaw"] + OAUTH1["/home/openclaw/.codex/auth.json\\n600 openclaw:openclaw"] + OAUTH2["/home/openclaw/.codex/auth-andrea.json\\n600 openclaw:openclaw"] + end + + subgraph CPSRC["Control-plane build source"] + CPROOT["/opt/openclaw/control-plane/source\\n755 efra:efra"] + end + + subgraph CP["Control-plane profile efra-core (/home/efra/openclaw-control-plane/efra-core)"] + CPDIR["project dir\\n755 efra:efra"] + CPENV[".env\\n640 efra:efra"] + CPC["docker-compose.yml\\n644 efra:efra"] + CPP["prometheus/prometheus.yml\\n644 efra:efra"] + CPG["grafana/provisioning/datasources/datasource.yml\\n644 efra:efra"] + + DROOT["data/\\n755 efra:efra"] + DNATS["data/nats\\n755 root:root"] + DPG["data/postgres\\n700 uid70:root"] + DPROM["data/prometheus\\n755 root:root"] + DGRA["data/grafana\\n755 root:root"] + DUK["data/uptime-kuma\\n755 root:root"] + end + + subgraph RT["Docker runtime (project ocp-efra-core)"] + ING["ingress\\n127.0.0.1:30101->3000"] + API["control-api\\n127.0.0.1:39101->39090"] + NATS["nats\\n127.0.0.1:14222->4222"] + PG["postgres"] + ROU["router"] + BRK["broker"] + WM["worker-main"] + WR["worker-research"] + WBL["worker-browser-login\\n(host network + shm 1gb)"] + WCO["worker-coolify-ops"] + PRO["prometheus\\n127.0.0.1:39091->9090"] + 
GRA["grafana\\n127.0.0.1:31001->3000"] + UK["uptime-kuma\\n127.0.0.1:31081->3001"] + end + + INV --> ER + INV --> CR + INV --> AS + + ER --> EROOT + ER --> ESEC + ER --> EDEV + ER --> UDEV + ER --> GROOT + ER --> GCFG + UDEV --> GWPROC + ESEC --> EDEV + ESEC --> EAND + GROOT --> GAG + GAG --> AMAIN --> PMAIN + GAG --> ARES --> PRES + GAG --> ABRO --> PBRO + GAG --> ACOO --> PCOO + GROOT --> WMAIN + GROOT --> WRES + GROOT --> WBRO + GROOT --> WCOO + GAG --> SMAIN + GCFG --> GWPROC + + ECOD --> AS --> OCOD + AS --> OAUTH1 + AS --> OAUTH2 + AS --> PMAIN + AS --> PRES + AS --> PBRO + AS --> PCOO + AS --> GCFG + + CR --> CPROOT + CR --> CPDIR + CR --> CPENV + CR --> CPC + CR --> CPP + CR --> CPG + CR --> DROOT + DROOT --> DNATS + DROOT --> DPG + DROOT --> DPROM + DROOT --> DGRA + DROOT --> DUK + + CPC --> RT + CPENV --> RT + CPROOT --> RT + + RT --> ING + RT --> API + RT --> NATS + RT --> PG + RT --> ROU + RT --> BRK + RT --> WM + RT --> WR + RT --> WBL + RT --> WCO + RT --> PRO + RT --> GRA + RT --> UK + + WM --> GWPROC + WR --> GWPROC + WBL --> GWPROC + WCO --> GWPROC +``` + +## 2) Runtime Message Flow (full efra-core) + +```mermaid +sequenceDiagram + autonumber + participant TG as Telegram/API client + participant ING as ingress :30101 + participant NATS as NATS JetStream :14222 + participant RT as router + participant WK as worker- + participant OC as openclaw CLI (worker exec mode=openclaw) + participant GW as gateway dev-main :19011 + participant BR as broker + participant PG as Postgres + participant CA as control-api :39101 + + TG->>ING: POST /telegram/webhook or /ingress/simulate + ING->>NATS: publish tasks.ingress + RT->>NATS: consume tasks.ingress + RT->>NATS: publish tasks.agent. + WK->>NATS: consume tasks.agent. + WK->>OC: openclaw --profile dev-main agent --agent + OC->>GW: uses /etc/openclaw/secrets/dev-main.env + GW-->>OC: agent response + WK->>NATS: publish results.agent. 
+ BR->>NATS: consume results.agent.* + BR->>PG: upsert task + events + CA->>PG: GET /tasks/ + BR-->>TG: optional Telegram sendMessage +``` + +## 3) Auth Sync Flow (non-interactive) + +```mermaid +flowchart TB + A["/home/efra/.codex/auth.json (+ auth-andrea.json)\\ncredential source"] --> B["make auth-sync\\nops/auth-sync.sh"] + B --> C["/home/openclaw/.codex\\n700 openclaw:openclaw"] + C --> D["auth.json / auth-andrea.json\\n600 openclaw:openclaw"] + + B --> E["/home/openclaw/.openclaw-dev-main/agents/main/agent/auth-profiles.json\\n600 openclaw:openclaw"] + B --> F[".../agents/research/agent/auth-profiles.json\\n600 openclaw:openclaw"] + B --> G[".../agents/browser-login/agent/auth-profiles.json\\n600 openclaw:openclaw"] + B --> H[".../agents/coolify-ops/agent/auth-profiles.json\\n600 openclaw:openclaw"] + + B --> I["openclaw.json (dev-main)\\nset model: openai-codex/gpt-5.3-codex"] +``` + +## 4) Quick Permission Matrix (critical paths) + +| Path | Mode | Owner:Group | Purpose | +|---|---:|---|---| +| `/etc/openclaw` | `750` | `root:openclaw` | OpenClaw system config root | +| `/etc/openclaw/secrets` | `750` | `root:openclaw` | per-profile env secrets | +| `/etc/openclaw/secrets/dev-main.env` | `640` | `root:openclaw` | gateway/profile runtime secrets | +| `/etc/systemd/system/openclaw-gateway-dev-main.service` | `644` | `root:root` | gateway unit | +| `/home/openclaw/.openclaw-dev-main` | `755` | `openclaw:openclaw` | profile state root | +| `/home/openclaw/.openclaw-dev-main/openclaw.json` | `600` | `openclaw:openclaw` | profile config | +| `/home/openclaw/.openclaw-dev-main/agents/*/agent` | `700` | `openclaw:openclaw` | per-agent private state | +| `/home/openclaw/.openclaw-dev-main/agents/*/agent/auth-profiles.json` | `600` | `openclaw:openclaw` | provider auth store | +| `/home/openclaw/.codex` | `700` | `openclaw:openclaw` | local codex credential mirror | +| `/home/openclaw/.codex/auth*.json` | `600` | `openclaw:openclaw` | codex oauth tokens | +| 
`/home/efra/openclaw-control-plane/efra-core/.env` | `640` | `efra:efra` | compose secrets/env | +| `/home/efra/openclaw-control-plane/efra-core/data/postgres` | `700` | `uid70:root` | postgres persistent volume | +| `/opt/openclaw/control-plane/source` | `755` | `efra:efra` | service build source synced by ansible | + diff --git a/inventories/dev/group_vars/all.yml b/inventories/dev/group_vars/all.yml index 0c6ceae..84655d2 100644 --- a/inventories/dev/group_vars/all.yml +++ b/inventories/dev/group_vars/all.yml @@ -21,15 +21,37 @@ openclaw_enterprise_profiles: tools_profile: coding sandbox_mode: non-main sandbox_scope: session + user_profile: + name: Efrain Garay + call: Efra + timezone: America/Santiago + notes: + - Prefer direct answers with practical execution details. + - Keep communication clear, human, and low on filler text. agents: - id: main default: true + name: Menicius Core + identity: + name: Menicius + theme: operations orchestrator + emoji: ":compass:" workspace: /home/openclaw/.openclaw-dev-main/workspace - id: research + name: Russell Research + identity: + name: Russell + theme: evidence analyst + emoji: ":microscope:" workspace: /home/openclaw/.openclaw-dev-main/workspace-research tools: profile: coding - id: browser-login + name: Andtera Browser + identity: + name: Andtera + theme: browser login specialist + emoji: ":key:" workspace: /home/openclaw/.openclaw-dev-main/workspace-browser-login sandbox: mode: "off" @@ -38,9 +60,63 @@ openclaw_enterprise_profiles: allow: - browser - id: coolify-ops + name: Forge Ops + identity: + name: Forge + theme: deployment reliability engineer + emoji: ":gear:" workspace: /home/openclaw/.openclaw-dev-main/workspace-coolify-ops tools: profile: coding + agent_personas: + - id: main + display_name: Menicius Core + identity_name: Menicius + creature: Systems navigator + role: Primary operator and execution lead + vibe: Direct, clear, and pragmatic + tone: Calm and action oriented + mission: Convert requests into 
executable outcomes and close loops with evidence. + responsibilities: + - Triage requests and define an execution path. + - Coordinate specialized agents when needed. + - Return clean human summaries with concrete next steps. + - id: research + display_name: Russell Research + identity_name: Russell + creature: Research analyst + role: Deep analysis and verification specialist + vibe: Methodical and factual + tone: Precise and neutral + mission: Produce high-confidence findings backed by explicit sources or repo evidence. + responsibilities: + - Investigate docs, code, and references before conclusions. + - Separate facts, assumptions, and open questions. + - Highlight risks, regressions, and validation gaps. + - id: browser-login + display_name: Andtera Browser + identity_name: Andtera + creature: Browser workflow operator + role: Human-in-the-loop auth and web flow specialist + vibe: Careful and procedural + tone: Simple and step-by-step + mission: Complete browser-based authentication and guided web tasks safely. + responsibilities: + - Handle login, verification, and manual browser checkpoints. + - Report exact steps, status, and blockers. + - Avoid broad changes outside browser scope. + - id: coolify-ops + display_name: Forge Ops + identity_name: Forge + creature: Infrastructure engineer + role: Coolify and deployment reliability specialist + vibe: Operational and resilient + tone: Direct with measurable outcomes + mission: Keep deployments stable and recover quickly from service failures. + responsibilities: + - Validate deployment health and runtime services. + - Apply targeted fixes with rollback-aware steps. + - Document evidence from logs, checks, and post-fix validation. 
bindings: [] env: OPENCLAW_GATEWAY_TOKEN: "{{ vault_openclaw_gateway_token_dev_main | default('replace-me-dev-main-gateway-token') }}" @@ -59,10 +135,35 @@ openclaw_enterprise_profiles: tools_profile: messaging sandbox_mode: non-main sandbox_scope: session + user_profile: + name: Efrain Garay + call: Efra + timezone: America/Santiago + notes: + - Keep tone warm and simple for personal assistant interactions. + - Stay concise unless more detail is requested. agents: - id: main default: true + name: Andrea Concierge + identity: + name: Andrea + theme: personal concierge assistant + emoji: ":sparkles:" workspace: /home/openclaw/.openclaw-andrea/workspace + agent_personas: + - id: main + display_name: Andrea Concierge + identity_name: Andrea + creature: Personal assistant + role: Messaging-first daily support + vibe: Friendly, clear, and calm + tone: Human and concise + mission: Help with everyday requests quickly while keeping responses clean and natural. + responsibilities: + - Prioritize direct user requests and lightweight follow-through. + - Keep instructions clear and easy to execute. + - Use Markdown code blocks whenever sharing technical snippets. 
bindings: [] env: OPENCLAW_GATEWAY_TOKEN: "{{ vault_openclaw_gateway_token_andrea | default('replace-me-andrea-gateway-token') }}" diff --git a/roles/openclaw_control_plane/tasks/profile.yml b/roles/openclaw_control_plane/tasks/profile.yml index 8073358..704ed18 100644 --- a/roles/openclaw_control_plane/tasks/profile.yml +++ b/roles/openclaw_control_plane/tasks/profile.yml @@ -106,7 +106,7 @@ -U {{ profile.postgres_admin_user | default(profile.postgres_user | default('openclaw')) }} \ -d {{ profile.postgres_db | default('openclaw_control') }} \ -v ON_ERROR_STOP=1 <<'SQL' - DO \$\$ + DO $$ BEGIN IF NOT EXISTS (SELECT 1 FROM pg_roles WHERE rolname = '{{ profile.postgres_user | default('openclaw') }}') THEN CREATE ROLE {{ profile.postgres_user | default('openclaw') }} LOGIN PASSWORD '{{ profile.postgres_password | replace("'", "''") }}'; @@ -114,7 +114,7 @@ ALTER ROLE {{ profile.postgres_user | default('openclaw') }} WITH LOGIN PASSWORD '{{ profile.postgres_password | replace("'", "''") }}'; END IF; END - \$\$; + $$; SQL register: profile_postgres_reconcile changed_when: false diff --git a/roles/openclaw_enterprise/tasks/main.yml b/roles/openclaw_enterprise/tasks/main.yml index d1595e6..0a844a6 100644 --- a/roles/openclaw_enterprise/tasks/main.yml +++ b/roles/openclaw_enterprise/tasks/main.yml @@ -81,6 +81,108 @@ loop_control: loop_var: profile +- name: Ensure per-agent workspace directories exist + ansible.builtin.file: + path: "{{ agent_workspace }}" + state: directory + owner: "{{ openclaw_user }}" + group: "{{ openclaw_user }}" + mode: '0755' + vars: + agent_workspace: >- + {{ + item.1.workspace + | default( + item.0.workspace_root + | default((item.0.state_dir | default(openclaw_home ~ '/.openclaw-' ~ item.0.name)) ~ '/workspace') + ) + }} + loop: "{{ openclaw_enterprise_profiles | subelements('agents', skip_missing=True) }}" + +- name: Seed AGENTS.md per agent workspace + ansible.builtin.template: + src: workspace-agents.md.j2 + dest: "{{ agent_workspace 
}}/AGENTS.md" + owner: "{{ openclaw_user }}" + group: "{{ openclaw_user }}" + mode: '0644' + vars: + profile: "{{ item.0 }}" + agent: "{{ item.1 }}" + agent_workspace: >- + {{ + item.1.workspace + | default( + item.0.workspace_root + | default((item.0.state_dir | default(openclaw_home ~ '/.openclaw-' ~ item.0.name)) ~ '/workspace') + ) + }} + agent_persona: "{{ (item.0.agent_personas | default([]) | selectattr('id', 'equalto', item.1.id) | list | first | default({})) }}" + loop: "{{ openclaw_enterprise_profiles | subelements('agents', skip_missing=True) }}" + +- name: Seed SOUL.md per agent workspace + ansible.builtin.template: + src: workspace-soul.md.j2 + dest: "{{ agent_workspace }}/SOUL.md" + owner: "{{ openclaw_user }}" + group: "{{ openclaw_user }}" + mode: '0644' + vars: + profile: "{{ item.0 }}" + agent: "{{ item.1 }}" + agent_workspace: >- + {{ + item.1.workspace + | default( + item.0.workspace_root + | default((item.0.state_dir | default(openclaw_home ~ '/.openclaw-' ~ item.0.name)) ~ '/workspace') + ) + }} + agent_persona: "{{ (item.0.agent_personas | default([]) | selectattr('id', 'equalto', item.1.id) | list | first | default({})) }}" + loop: "{{ openclaw_enterprise_profiles | subelements('agents', skip_missing=True) }}" + +- name: Seed IDENTITY.md per agent workspace + ansible.builtin.template: + src: workspace-identity.md.j2 + dest: "{{ agent_workspace }}/IDENTITY.md" + owner: "{{ openclaw_user }}" + group: "{{ openclaw_user }}" + mode: '0644' + vars: + profile: "{{ item.0 }}" + agent: "{{ item.1 }}" + agent_workspace: >- + {{ + item.1.workspace + | default( + item.0.workspace_root + | default((item.0.state_dir | default(openclaw_home ~ '/.openclaw-' ~ item.0.name)) ~ '/workspace') + ) + }} + agent_persona: "{{ (item.0.agent_personas | default([]) | selectattr('id', 'equalto', item.1.id) | list | first | default({})) }}" + loop: "{{ openclaw_enterprise_profiles | subelements('agents', skip_missing=True) }}" + +- name: Seed USER.md per agent workspace 
+ ansible.builtin.template: + src: workspace-user.md.j2 + dest: "{{ agent_workspace }}/USER.md" + owner: "{{ openclaw_user }}" + group: "{{ openclaw_user }}" + mode: '0644' + vars: + profile: "{{ item.0 }}" + agent: "{{ item.1 }}" + agent_workspace: >- + {{ + item.1.workspace + | default( + item.0.workspace_root + | default((item.0.state_dir | default(openclaw_home ~ '/.openclaw-' ~ item.0.name)) ~ '/workspace') + ) + }} + agent_persona: "{{ (item.0.agent_personas | default([]) | selectattr('id', 'equalto', item.1.id) | list | first | default({})) }}" + loop: "{{ openclaw_enterprise_profiles | subelements('agents', skip_missing=True) }}" + - name: Validate required secret keys in profile env ansible.builtin.assert: that: diff --git a/roles/openclaw_enterprise/templates/workspace-agents.md.j2 b/roles/openclaw_enterprise/templates/workspace-agents.md.j2 new file mode 100644 index 0000000..b5c68db --- /dev/null +++ b/roles/openclaw_enterprise/templates/workspace-agents.md.j2 @@ -0,0 +1,36 @@ +# AGENTS.md - {{ agent_persona.display_name | default(agent.name | default(agent.id)) }} + +Profile: `{{ profile.name }}` +Agent id: `{{ agent.id }}` + +## Mission +{{ agent_persona.mission | default('Deliver reliable outcomes for this agent scope, with clear execution and traceability.') }} + +## Core tasks +{% set responsibilities = agent_persona.responsibilities | default([]) %} +{% if responsibilities | length > 0 %} +{% for item in responsibilities %} +- {{ item }} +{% endfor %} +{% else %} +- Understand the request context before acting. +- Execute only the work that belongs to this agent scope. +- Return concise outcomes with next actionable step. +{% endif %} + +## Response style +- Keep responses clean and human. +- Avoid filler or repetitive phrasing. +- When sharing code, commands, config, or payloads, always use Markdown fenced blocks. +- Explain technical decisions briefly when they affect risk, cost, or behavior. 
+ +## Operating protocol +- Investigate local context first (files, config, logs) before asking for clarification. +- Ask for confirmation before destructive actions or external/public actions. +- Prefer reversible operations when possible. +- Report what changed and how it was validated. + +## Session baseline +- Read `SOUL.md` for tone and boundaries. +- Read `USER.md` for user preferences. +- Keep this file updated when scope or responsibilities change. diff --git a/roles/openclaw_enterprise/templates/workspace-identity.md.j2 b/roles/openclaw_enterprise/templates/workspace-identity.md.j2 new file mode 100644 index 0000000..6e49a8f --- /dev/null +++ b/roles/openclaw_enterprise/templates/workspace-identity.md.j2 @@ -0,0 +1,7 @@ +# IDENTITY.md - {{ agent_persona.display_name | default(agent.name | default(agent.id)) }} + +- Name: {{ agent_persona.identity_name | default(agent.identity.name | default(agent.name | default(agent.id))) }} +- Creature: {{ agent_persona.creature | default('Digital operator') }} +- Vibe: {{ agent_persona.vibe | default(agent.identity.theme | default('Focused and pragmatic')) }} +- Emoji: {{ agent_persona.emoji | default(agent.identity.emoji | default(':openclaw:')) }} +- Avatar: {{ agent_persona.avatar | default(agent.identity.avatar | default('')) }} diff --git a/roles/openclaw_enterprise/templates/workspace-soul.md.j2 b/roles/openclaw_enterprise/templates/workspace-soul.md.j2 new file mode 100644 index 0000000..5746991 --- /dev/null +++ b/roles/openclaw_enterprise/templates/workspace-soul.md.j2 @@ -0,0 +1,22 @@ +# SOUL.md - {{ agent_persona.display_name | default(agent.name | default(agent.id)) }} + +## Identity core +- Role: {{ agent_persona.role | default('Specialized OpenClaw operator') }} +- Vibe: {{ agent_persona.vibe | default('Direct, calm, and practical') }} +- Tone: {{ agent_persona.tone | default('Professional, concise, and respectful') }} + +## Boundaries +- Never expose private data from unrelated sessions or channels. 
+- Do not simulate completion when work is still pending. +- Do not run destructive commands without explicit confirmation. +- Do not perform external/public actions unless clearly requested. + +## Working principles +- Clarity first: state assumptions and constraints early. +- Precision over verbosity: enough detail to execute safely. +- Ownership: close loops, validate outcomes, report evidence. +- Reliability: prefer deterministic steps over guesswork. + +## Behavioral requirement +- Output must feel human and clear. +- Any technical snippet must be formatted as Markdown code block. diff --git a/roles/openclaw_enterprise/templates/workspace-user.md.j2 b/roles/openclaw_enterprise/templates/workspace-user.md.j2 new file mode 100644 index 0000000..40ef136 --- /dev/null +++ b/roles/openclaw_enterprise/templates/workspace-user.md.j2 @@ -0,0 +1,22 @@ +# USER.md - About Your Human + +- Name: {{ profile.user_profile.name | default('Efrain') }} +- What to call them: {{ profile.user_profile.call | default('Efra') }} +- Pronouns: {{ profile.user_profile.pronouns | default('') }} +- Timezone: {{ profile.user_profile.timezone | default('America/Santiago') }} + +## Notes +{% set notes = profile.user_profile.notes | default([]) %} +{% if notes | length > 0 %} +{% for note in notes %} +- {{ note }} +{% endfor %} +{% else %} +- Operator expects practical outcomes and minimal fluff. +- Keep communication clear and actionable. +{% endif %} + +## Response contract +- Human-readable answers. +- Use Markdown fenced code blocks for all code/commands/config snippets. +- Separate findings, actions, and next steps clearly. 
From 71d4bfcfa03b29fdb0693955c6b7f152278a4b27 Mon Sep 17 00:00:00 2001 From: Efrain Garay Date: Sun, 1 Mar 2026 22:40:53 -0300 Subject: [PATCH 16/16] fix: harden runtime permissions across auth-sync and workers --- ops/auth-sync.sh | 9 +++++++++ roles/openclaw/tasks/openclaw.yml | 4 ++-- roles/openclaw_control_plane/tasks/main.yml | 16 ++++++++++++++++ 3 files changed, 27 insertions(+), 2 deletions(-) diff --git a/ops/auth-sync.sh b/ops/auth-sync.sh index 67f2822..28028b6 100755 --- a/ops/auth-sync.sh +++ b/ops/auth-sync.sh @@ -92,6 +92,14 @@ function ensureDir(dirPath, uid, gid) { fs.chownSync(dirPath, uid, gid); } +function ensureProfileSkeleton(profileDir, uid, gid) { + // Recursive mkdir can leave intermediate directories owned by root. + // Ensure profile roots are writable by openclaw before model configuration. + ensureDir(profileDir, uid, gid); + ensureDir(path.join(profileDir, "agents"), uid, gid); + ensureDir(path.join(profileDir, "agents", "main"), uid, gid); +} + function loadStore(storePath) { try { const parsed = JSON.parse(fs.readFileSync(storePath, "utf8")); @@ -186,6 +194,7 @@ let stores = 0; for (const profile of profiles) { const profileDir = resolveProfileDir(profile); const selected = profile === "andrea" ? 
andreaCred : defaultCred; + ensureProfileSkeleton(profileDir, uid, gid); const agentDirs = collectAgentDirs(profileDir); for (const agentDir of agentDirs) { diff --git a/roles/openclaw/tasks/openclaw.yml b/roles/openclaw/tasks/openclaw.yml index 4f1a361..2cec52b 100644 --- a/roles/openclaw/tasks/openclaw.yml +++ b/roles/openclaw/tasks/openclaw.yml @@ -44,14 +44,14 @@ - "{{ openclaw_home }}/.local/share/pnpm/store" - "{{ openclaw_home }}/.local/bin" -- name: Ensure pnpm directories have correct ownership +- name: Ensure pnpm directory tree ownership is correct ansible.builtin.file: path: "{{ openclaw_home }}/.local/share/pnpm" state: directory owner: "{{ openclaw_user }}" group: "{{ openclaw_user }}" recurse: true - mode: '0755' + when: not (ci_test | default(false) | bool) - name: Configure pnpm for openclaw user ansible.builtin.shell: diff --git a/roles/openclaw_control_plane/tasks/main.yml b/roles/openclaw_control_plane/tasks/main.yml index 32ee396..5ce5b83 100644 --- a/roles/openclaw_control_plane/tasks/main.yml +++ b/roles/openclaw_control_plane/tasks/main.yml @@ -41,6 +41,22 @@ loop_var: profile no_log: true +- name: Resolve runtime UID/GID for openclaw worker user + ansible.builtin.getent: + database: passwd + key: "{{ openclaw_user | default('openclaw') }}" + +- name: Set effective control-plane worker UID/GID from system account + ansible.builtin.set_fact: + openclaw_control_plane_worker_uid: >- + {{ + ansible_facts.getent_passwd[openclaw_user | default('openclaw')][1] + }} + openclaw_control_plane_worker_gid: >- + {{ + ansible_facts.getent_passwd[openclaw_user | default('openclaw')][2] + }} + - name: Ensure control-plane runtime root exists ansible.builtin.file: path: "{{ openclaw_control_plane_runtime_root }}"