diff --git a/.github/workflows/clawdash-image.yml b/.github/workflows/clawdash-image.yml new file mode 100644 index 0000000..b882113 --- /dev/null +++ b/.github/workflows/clawdash-image.yml @@ -0,0 +1,58 @@ +name: clawdash Image + +on: + push: + branches: + - master + tags: + - "v*" + pull_request: + paths: + - "cmd/clawdash/**" + - "internal/clawdash/**" + - "dockerfiles/clawdash/**" + - "go.mod" + - "go.sum" + - ".github/workflows/clawdash-image.yml" + workflow_dispatch: + +permissions: + contents: read + packages: write + +jobs: + build-and-publish: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - uses: docker/setup-qemu-action@v3 + + - uses: docker/setup-buildx-action@v3 + + - uses: docker/login-action@v3 + if: github.event_name != 'pull_request' + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - id: meta + uses: docker/metadata-action@v5 + with: + images: ghcr.io/${{ github.repository_owner }}/clawdash + tags: | + type=raw,value=latest,enable={{is_default_branch}} + type=ref,event=tag + type=sha,format=short + + - uses: docker/build-push-action@v6 + with: + context: . + file: dockerfiles/clawdash/Dockerfile + platforms: linux/amd64,linux/arm64 + push: ${{ github.event_name != 'pull_request' }} + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} + cache-from: type=gha + cache-to: type=gha,mode=max diff --git a/README.md b/README.md index c3b7072..0c2be82 100644 --- a/README.md +++ b/README.md @@ -35,11 +35,13 @@ claw build -t quickstart-assistant ./agents/assistant claw up -f claw-pod.yml -d # Verify -claw ps -f claw-pod.yml # assistant + cllama-passthrough both running +claw ps -f claw-pod.yml # assistant + cllama both running claw health -f claw-pod.yml # both healthy ``` -Open **http://localhost:8081** — the cllama governance proxy dashboard. Watch every LLM call in real time: which agent, which model, token counts, cost. 
+Open **http://localhost:8181** — the cllama governance proxy dashboard. Watch every LLM call in real time: which agent, which model, token counts, cost. + +Open **http://localhost:8082** — the Clawdapus Dash fleet dashboard. View live service health, topology wiring, and per-service drill-down status. Message `@quickstart-bot` in your Discord server. The bot responds through the proxy — it has no direct API access. The dashboard updates live. @@ -59,6 +61,23 @@ claw up -d claw agent add researcher ``` +## Dashboard Screenshots + +Fleet view with integrated costs status: + +![Clawdapus Dash Fleet](docs/screenshots/clawdash-fleet-costs.png) + +If a cllama build does not emit `GET /costs/api`, Clawdapus Dash surfaces an explicit "cost emission not available yet" state instead of linking to a dead page. +API data is authoritative; log-derived cost estimation is opt-in via `CLAWDASH_COST_LOG_FALLBACK=1`. + +Topology view: + +![Clawdapus Dash Topology](docs/screenshots/clawdash-topology.png) + +Service detail view: + +![Clawdapus Dash Detail](docs/screenshots/clawdash-detail.png) + --- ## Install @@ -248,9 +267,9 @@ When a reasoning model tries to govern itself, the guardrails are part of the sa - **Identity resolution:** Single proxy serves an entire pod. Bearer tokens resolve which agent is calling. - **Cost accounting:** Extracts token usage from every response, multiplies by pricing table, tracks per agent/provider/model. - **Audit logging:** Structured JSON on stdout — timestamp, agent, model, latency, tokens, cost, intervention reason. -- **Operator dashboard:** Real-time web UI at port 8081 — agent activity, provider status, cost breakdown. +- **Operator dashboard:** Real-time web UI at host port 8181 by default (container `:8081`) — agent activity, provider status, cost breakdown. 
-The reference implementation is [`cllama-passthrough`](https://github.com/mostlydev/cllama-passthrough) — a zero-dependency Go binary that implements the transport layer (identity, routing, cost tracking). Future proxy types (`cllama-policy`) will add bidirectional interception: evaluating outbound prompts and amending inbound responses against the agent's behavioral contract. +The reference implementation is [`cllama`](https://github.com/mostlydev/cllama) — a zero-dependency Go binary that implements the transport layer (identity, routing, cost tracking). Future proxy types (`cllama-policy`) will add bidirectional interception: evaluating outbound prompts and amending inbound responses against the agent's behavioral contract. See the [cllama specification](./docs/CLLAMA_SPEC.md) for the full standard. diff --git a/cmd/claw/compose_manifest.go b/cmd/claw/compose_manifest.go new file mode 100644 index 0000000..ae744f7 --- /dev/null +++ b/cmd/claw/compose_manifest.go @@ -0,0 +1,132 @@ +package main + +import ( + "encoding/json" + "fmt" + "path/filepath" + "sort" + "strings" + + "github.com/mostlydev/clawdapus/internal/clawdash" + "github.com/mostlydev/clawdapus/internal/cllama" + "github.com/mostlydev/clawdapus/internal/driver" + "github.com/mostlydev/clawdapus/internal/pod" +) + +func writePodManifest(runtimeDir string, p *pod.Pod, resolved map[string]*driver.ResolvedClaw, proxies []pod.CllamaProxyConfig) (string, error) { + manifest := buildPodManifest(p, resolved, proxies) + data, err := json.MarshalIndent(manifest, "", " ") + if err != nil { + return "", fmt.Errorf("encode pod manifest: %w", err) + } + + path := filepath.Join(runtimeDir, "pod-manifest.json") + if err := writeRuntimeFile(path, append(data, '\n'), 0644); err != nil { + return "", fmt.Errorf("write pod manifest %q: %w", path, err) + } + return path, nil +} + +func buildPodManifest(p *pod.Pod, resolved map[string]*driver.ResolvedClaw, proxies []pod.CllamaProxyConfig) *clawdash.PodManifest { + out := 
&clawdash.PodManifest{ + PodName: p.Name, + Services: make(map[string]clawdash.ServiceManifest, len(p.Services)), + } + + names := make([]string, 0, len(p.Services)) + for name := range p.Services { + names = append(names, name) + } + sort.Strings(names) + + for _, name := range names { + svc := p.Services[name] + manifest := clawdash.ServiceManifest{ + ImageRef: svc.Image, + Count: 1, + } + if svc.Claw != nil && svc.Claw.Count > 0 { + manifest.Count = svc.Claw.Count + } + + if rc, ok := resolved[name]; ok && rc != nil { + manifest.ClawType = rc.ClawType + manifest.Agent = rc.Agent + manifest.Models = cloneStringMap(rc.Models) + manifest.Handles = rc.Handles + manifest.PeerHandles = rc.PeerHandles + manifest.Surfaces = toSurfaceManifest(rc.Surfaces) + manifest.Skills = resolvedSkillNames(rc.Skills) + manifest.Invocations = append([]driver.Invocation(nil), rc.Invocations...) + manifest.Cllama = append([]string(nil), rc.Cllama...) + if rc.Count > 0 { + manifest.Count = rc.Count + } + } else if svc.Claw != nil { + manifest.Handles = svc.Claw.Handles + manifest.Surfaces = toSurfaceManifest(svc.Claw.Surfaces) + manifest.Cllama = append([]string(nil), svc.Claw.Cllama...) 
+ } + + out.Services[name] = manifest + } + + if len(proxies) > 0 { + out.Proxies = make([]clawdash.ProxyManifest, 0, len(proxies)) + for _, proxy := range proxies { + out.Proxies = append(out.Proxies, clawdash.ProxyManifest{ + ProxyType: proxy.ProxyType, + ServiceName: cllama.ProxyServiceName(proxy.ProxyType), + Image: proxy.Image, + }) + } + sort.Slice(out.Proxies, func(i, j int) bool { + return out.Proxies[i].ServiceName < out.Proxies[j].ServiceName + }) + } + + return out +} + +func toSurfaceManifest(in []driver.ResolvedSurface) []clawdash.SurfaceManifest { + if len(in) == 0 { + return nil + } + out := make([]clawdash.SurfaceManifest, 0, len(in)) + for _, s := range in { + out = append(out, clawdash.SurfaceManifest{ + Scheme: s.Scheme, + Target: s.Target, + AccessMode: s.AccessMode, + Ports: append([]string(nil), s.Ports...), + ChannelConfig: s.ChannelConfig, + }) + } + return out +} + +func resolvedSkillNames(in []driver.ResolvedSkill) []string { + if len(in) == 0 { + return nil + } + out := make([]string, 0, len(in)) + for _, sk := range in { + name := strings.TrimSpace(sk.Name) + if name == "" { + continue + } + out = append(out, name) + } + return out +} + +func cloneStringMap(in map[string]string) map[string]string { + if len(in) == 0 { + return nil + } + out := make(map[string]string, len(in)) + for k, v := range in { + out[k] = v + } + return out +} diff --git a/cmd/claw/compose_manifest_test.go b/cmd/claw/compose_manifest_test.go new file mode 100644 index 0000000..493dc55 --- /dev/null +++ b/cmd/claw/compose_manifest_test.go @@ -0,0 +1,128 @@ +package main + +import ( + "encoding/json" + "os" + "path/filepath" + "testing" + + "github.com/mostlydev/clawdapus/internal/driver" + "github.com/mostlydev/clawdapus/internal/pod" +) + +func TestBuildPodManifestIncludesResolvedState(t *testing.T) { + p := &pod.Pod{ + Name: "fleet", + Services: map[string]*pod.Service{ + "bot": { + Image: "bot:latest", + Claw: &pod.ClawBlock{Count: 2}, + }, + "redis": { + Image: 
"redis:7", + }, + }, + } + + resolved := map[string]*driver.ResolvedClaw{ + "bot": { + ServiceName: "bot", + ImageRef: "bot:latest", + ClawType: "openclaw", + Agent: "AGENTS.md", + Models: map[string]string{ + "primary": "anthropic/claude-sonnet-4-20250514", + }, + Count: 2, + Handles: map[string]*driver.HandleInfo{ + "discord": {ID: "123", Username: "fleet-bot"}, + }, + PeerHandles: map[string]map[string]*driver.HandleInfo{ + "analyst": { + "discord": {ID: "456", Username: "analyst-bot"}, + }, + }, + Surfaces: []driver.ResolvedSurface{ + {Scheme: "channel", Target: "discord"}, + {Scheme: "service", Target: "redis", Ports: []string{"6379"}}, + }, + Skills: []driver.ResolvedSkill{ + {Name: "risk-limits.md", HostPath: "/host/risk-limits.md"}, + }, + Invocations: []driver.Invocation{ + {Schedule: "0 * * * *", Message: "status pulse", Name: "status", To: "123"}, + }, + Cllama: []string{"passthrough"}, + }, + } + proxies := []pod.CllamaProxyConfig{ + {ProxyType: "passthrough", Image: "ghcr.io/mostlydev/cllama:latest"}, + } + + got := buildPodManifest(p, resolved, proxies) + if got.PodName != "fleet" { + t.Fatalf("expected podName=fleet, got %q", got.PodName) + } + if len(got.Services) != 2 { + t.Fatalf("expected 2 services, got %d", len(got.Services)) + } + + botSvc := got.Services["bot"] + if botSvc.ClawType != "openclaw" { + t.Fatalf("expected claw type openclaw, got %q", botSvc.ClawType) + } + if botSvc.Count != 2 { + t.Fatalf("expected count 2, got %d", botSvc.Count) + } + if len(botSvc.Skills) != 1 || botSvc.Skills[0] != "risk-limits.md" { + t.Fatalf("expected skill name-only serialization, got %v", botSvc.Skills) + } + if len(botSvc.Cllama) != 1 || botSvc.Cllama[0] != "passthrough" { + t.Fatalf("expected cllama passthrough, got %v", botSvc.Cllama) + } + + redisSvc := got.Services["redis"] + if redisSvc.ClawType != "" { + t.Fatalf("expected non-claw service clawType empty, got %q", redisSvc.ClawType) + } + if redisSvc.Count != 1 { + t.Fatalf("expected non-claw 
count 1, got %d", redisSvc.Count) + } + + if len(got.Proxies) != 1 { + t.Fatalf("expected 1 proxy, got %d", len(got.Proxies)) + } + if got.Proxies[0].ServiceName != "cllama" { + t.Fatalf("expected proxy service cllama, got %q", got.Proxies[0].ServiceName) + } +} + +func TestWritePodManifestWritesJSONFile(t *testing.T) { + dir := t.TempDir() + p := &pod.Pod{ + Name: "test-pod", + Services: map[string]*pod.Service{ + "bot": {Image: "bot:latest"}, + }, + } + + path, err := writePodManifest(dir, p, nil, nil) + if err != nil { + t.Fatalf("writePodManifest returned error: %v", err) + } + if path != filepath.Join(dir, "pod-manifest.json") { + t.Fatalf("unexpected manifest path %q", path) + } + + raw, err := os.ReadFile(path) + if err != nil { + t.Fatalf("read manifest: %v", err) + } + var decoded map[string]interface{} + if err := json.Unmarshal(raw, &decoded); err != nil { + t.Fatalf("manifest is not valid json: %v", err) + } + if decoded["podName"] != "test-pod" { + t.Fatalf("expected podName=test-pod, got %v", decoded["podName"]) + } +} diff --git a/cmd/claw/compose_up.go b/cmd/claw/compose_up.go index 0378d95..a9d9e18 100644 --- a/cmd/claw/compose_up.go +++ b/cmd/claw/compose_up.go @@ -248,6 +248,7 @@ func runComposeUp(podFile string) error { cllamaEnabled, cllamaAgents := detectCllama(resolvedClaws) proxies := make([]pod.CllamaProxyConfig, 0) + cllamaDashboardPort := envOrDefault("CLLAMA_UI_PORT", "8181") if cllamaEnabled { proxyTypes := collectProxyTypes(resolvedClaws) if len(proxyTypes) > 1 { @@ -387,9 +388,10 @@ func runComposeUp(podFile string) error { for _, proxyType := range proxyTypes { proxies = append(proxies, pod.CllamaProxyConfig{ ProxyType: proxyType, - Image: fmt.Sprintf("ghcr.io/mostlydev/cllama-%s:latest", proxyType), + Image: cllama.ProxyImageRef(proxyType), ContextHostDir: filepath.Join(runtimeDir, "context"), AuthHostDir: authDir, + DashboardPort: cllamaDashboardPort, Environment: proxyEnv, PodName: p.Name, }) @@ -398,6 +400,21 @@ func 
runComposeUp(podFile string) error { strings.Join(proxyTypes, ", "), strings.Join(cllamaAgents, ", ")) } + manifestPath, err := writePodManifest(runtimeDir, p, resolvedClaws, proxies) + if err != nil { + return err + } + fmt.Printf("[claw] wrote %s\n", manifestPath) + + p.Clawdash = &pod.ClawdashConfig{ + Image: "ghcr.io/mostlydev/clawdash:latest", + Addr: envOrDefault("CLAWDASH_ADDR", ":8082"), + ManifestHostPath: manifestPath, + DockerSockHostPath: "/var/run/docker.sock", + CllamaCostsURL: firstIf(cllamaEnabled, fmt.Sprintf("http://localhost:%s", cllamaDashboardPort)), + PodName: p.Name, + } + // Pass 2: materialize after cllama tokens/context are resolved. for _, name := range sortedResolvedClawNames(resolvedClaws) { rc := resolvedClaws[name] @@ -783,6 +800,21 @@ func shortContainerIDForPostApply(id string) string { return id[:12] } +func envOrDefault(key, fallback string) string { + v := strings.TrimSpace(os.Getenv(key)) + if v == "" { + return fallback + } + return v +} + +func firstIf(ok bool, value string) string { + if ok { + return value + } + return "" +} + // resolveChannelID looks up a channel by name in the discord handle's guild topology. // Returns the channel ID if found, empty string otherwise. // Searches all guilds in the discord handle. 
diff --git a/cmd/claw/spike_test.go b/cmd/claw/spike_test.go index 52cb369..9a18ad1 100644 --- a/cmd/claw/spike_test.go +++ b/cmd/claw/spike_test.go @@ -199,8 +199,8 @@ func TestSpikeComposeUp(t *testing.T) { if !ok { t.Fatalf("openclaw.json: missing models.providers.%s object", provider) } - if got := entry["baseUrl"]; got != "http://cllama-passthrough:8080/v1" { - t.Errorf("openclaw.json: expected models.providers.%s.baseUrl=http://cllama-passthrough:8080/v1, got %v", provider, got) + if got := entry["baseUrl"]; got != "http://cllama:8080/v1" { + t.Errorf("openclaw.json: expected models.providers.%s.baseUrl=http://cllama:8080/v1, got %v", provider, got) } providerToken, _ := entry["apiKey"].(string) if matched, _ := regexp.MatchString(`^tiverton:[0-9a-f]{48}$`, providerToken); !matched { @@ -296,8 +296,8 @@ func TestSpikeComposeUp(t *testing.T) { t.Errorf("compose.generated.yml: expected to contain %q", want) } } - if !strings.Contains(composeSrc, "cllama-passthrough:") { - t.Errorf("compose.generated.yml: expected cllama-passthrough service") + if !strings.Contains(composeSrc, "cllama:") { + t.Errorf("compose.generated.yml: expected cllama service") } if !strings.Contains(composeSrc, "CLAW_CONTEXT_ROOT: /claw/context") { t.Errorf("compose.generated.yml: expected cllama context root env") @@ -442,7 +442,7 @@ func TestSpikeComposeUp(t *testing.T) { t.Errorf("allen: ANTHROPIC_BASE_URL not set: %v", errE) } else { allenBaseURL := strings.TrimSpace(string(allenEnvOut)) - if !strings.Contains(allenBaseURL, "cllama-passthrough") { + if !strings.Contains(allenBaseURL, "cllama") { t.Errorf("allen: ANTHROPIC_BASE_URL should point to cllama proxy, got %q", allenBaseURL) } else { t.Logf("allen ANTHROPIC_BASE_URL: %s", allenBaseURL) @@ -518,8 +518,8 @@ func TestSpikeComposeUp(t *testing.T) { if got := microCfg["model"]; got != "claude-sonnet-4" { t.Errorf("microclaw.config.yaml: expected model=claude-sonnet-4, got %v", got) } - if got := microCfg["llm_base_url"]; got != 
"http://cllama-passthrough:8080/v1" { - t.Errorf("microclaw.config.yaml: expected llm_base_url=http://cllama-passthrough:8080/v1, got %v", got) + if got := microCfg["llm_base_url"]; got != "http://cllama:8080/v1" { + t.Errorf("microclaw.config.yaml: expected llm_base_url=http://cllama:8080/v1, got %v", got) } microContainer := spikeContainerName("micro") diff --git a/cmd/clawdash/handler.go b/cmd/clawdash/handler.go new file mode 100644 index 0000000..d2cb28e --- /dev/null +++ b/cmd/clawdash/handler.go @@ -0,0 +1,800 @@ +package main + +import ( + "bufio" + "bytes" + "context" + "embed" + "encoding/json" + "fmt" + "html/template" + "io" + "net/http" + "net/url" + "os" + "slices" + "sort" + "strconv" + "strings" + "time" + + containerapi "github.com/docker/docker/api/types/container" + "github.com/docker/docker/api/types/filters" + "github.com/docker/docker/client" + "github.com/docker/docker/pkg/stdcopy" + manifestpkg "github.com/mostlydev/clawdapus/internal/clawdash" + "github.com/mostlydev/clawdapus/internal/cllama" + "github.com/mostlydev/clawdapus/internal/driver" +) + +//go:embed templates/*.html +var templateFS embed.FS + +type statusSource interface { + Snapshot(ctx context.Context, serviceNames []string) (map[string]serviceStatus, error) +} + +type handler struct { + manifest *manifestpkg.PodManifest + statusSource statusSource + cllamaCostsURL string + costLogFallback bool + httpClient *http.Client + tpl *template.Template +} + +func newHandler(manifest *manifestpkg.PodManifest, source statusSource, cllamaCostsURL string, costLogFallback bool) http.Handler { + funcs := template.FuncMap{ + "statusClass": statusClass, + "pathEscape": url.PathEscape, + "join": strings.Join, + "title": strings.Title, //nolint:staticcheck // simple title-case for badges. 
+ "truncate": truncate, + "statusLabel": statusLabel, + "hasStatusData": hasStatusData, + } + tpl := template.Must(template.New("clawdash").Funcs(funcs).ParseFS(templateFS, "templates/*.html")) + return &handler{ + manifest: manifest, + statusSource: source, + cllamaCostsURL: strings.TrimSpace(cllamaCostsURL), + costLogFallback: costLogFallback, + httpClient: &http.Client{ + Timeout: 2 * time.Second, + }, + tpl: tpl, + } +} + +func (h *handler) ServeHTTP(w http.ResponseWriter, r *http.Request) { + switch { + case r.Method == http.MethodGet && r.URL.Path == "/": + h.renderFleet(w, r) + return + case r.Method == http.MethodGet && r.URL.Path == "/topology": + h.renderTopology(w, r) + return + case r.Method == http.MethodGet && strings.HasPrefix(r.URL.Path, "/detail/"): + h.renderDetail(w, r) + return + case r.Method == http.MethodGet && r.URL.Path == "/api/status": + h.renderAPIStatus(w, r) + return + case r.Method == http.MethodGet && r.URL.Path == "/healthz": + w.WriteHeader(http.StatusOK) + _, _ = w.Write([]byte("ok")) + return + default: + http.NotFound(w, r) + return + } +} + +type fleetPageData struct { + PodName string + ActiveTab string + Agents []fleetCard + Proxies []fleetCard + Infrastructure []fleetCard + HasCllama bool + CllamaCostsURL string + HasCostLink bool + HasCostSummary bool + CostSummary cllamaCostSummary + CostSummaryErr string + StatusError string + HasStatusErrors bool +} + +type cllamaCostSummary struct { + TotalCostUSD float64 + Requests int + ProxyCount int + Source string +} + +type fleetCard struct { + ServiceName string + DetailPath string + RoleBadge string + RoleClass string + ClawType string + Status string + StatusClass string + Uptime string + Model string + Handles []handleRow + ProxyType string + Count int + RunningCount int +} + +type handleRow struct { + Platform string + Username string +} + +func (h *handler) renderFleet(w http.ResponseWriter, r *http.Request) { + statuses, statusErr := h.snapshot(r.Context()) + data := 
h.buildFleetPageData(r.Context(), statuses, statusErr) + w.Header().Set("Content-Type", "text/html; charset=utf-8") + _ = h.tpl.ExecuteTemplate(w, "fleet.html", data) +} + +func (h *handler) buildFleetPageData(ctx context.Context, statuses map[string]serviceStatus, statusErr string) fleetPageData { + serviceNames := sortedServiceNames(h.manifest.Services) + proxyByService := make(map[string]manifestpkg.ProxyManifest, len(h.manifest.Proxies)) + for _, p := range h.manifest.Proxies { + proxyByService[p.ServiceName] = p + } + + agents := make([]fleetCard, 0) + infra := make([]fleetCard, 0) + for _, name := range serviceNames { + svc := h.manifest.Services[name] + status := statuses[name] + card := fleetCard{ + ServiceName: name, + DetailPath: "/detail/" + url.PathEscape(name), + Status: status.Status, + StatusClass: statusClass(status.Status), + Uptime: status.Uptime, + Model: primaryModel(svc.Models), + Handles: sortedHandles(svc.Handles), + Count: svc.Count, + RunningCount: status.Running, + } + if card.Count < 1 { + card.Count = 1 + } + + if svc.ClawType != "" { + card.RoleBadge = svc.ClawType + card.RoleClass = "badge-cyan" + card.ClawType = svc.ClawType + card.ProxyType = joinNonEmpty(svc.Cllama, ", ") + agents = append(agents, card) + continue + } + + if proxy, ok := proxyByService[name]; ok { + card.RoleBadge = "proxy" + card.RoleClass = "badge-amber" + card.ProxyType = proxy.ProxyType + agents = append(agents, card) + continue + } + + card.RoleBadge = "native" + card.RoleClass = "badge-green" + infra = append(infra, card) + } + + proxies := make([]fleetCard, 0, len(h.manifest.Proxies)) + for _, proxy := range h.manifest.Proxies { + status := statuses[proxy.ServiceName] + proxies = append(proxies, fleetCard{ + ServiceName: proxy.ServiceName, + DetailPath: "/detail/" + url.PathEscape(proxy.ServiceName), + RoleBadge: "proxy", + RoleClass: "badge-amber", + Status: status.Status, + StatusClass: statusClass(status.Status), + Uptime: status.Uptime, + ProxyType: 
proxy.ProxyType, + Count: 1, + }) + } + sort.Slice(proxies, func(i, j int) bool { return proxies[i].ServiceName < proxies[j].ServiceName }) + + costSummary, costErr := h.fetchCllamaCostSummary(ctx) + + return fleetPageData{ + PodName: h.manifest.PodName, + ActiveTab: "fleet", + Agents: agents, + Proxies: proxies, + Infrastructure: infra, + HasCllama: len(proxies) > 0, + CllamaCostsURL: h.cllamaCostsURL, + HasCostLink: costSummary != nil && costSummary.Source == "api" && strings.TrimSpace(h.cllamaCostsURL) != "", + HasCostSummary: costSummary != nil, + CostSummary: firstCostSummary(costSummary), + CostSummaryErr: costErr, + StatusError: statusErr, + HasStatusErrors: statusErr != "", + } +} + +type detailPageData struct { + PodName string + ActiveTab string + ServiceName string + ImageRef string + Count int + IsProxy bool + Status serviceStatus + StatusClass string + StatusError string + Surfaces []manifestpkg.SurfaceManifest + Handles []handleDetailRow + Skills []string + Invocations []driver.Invocation + Models []modelRow + Cllama []cllamaDetailRow + HasStatusErrors bool +} + +type handleDetailRow struct { + Platform string + Username string + ID string + Guilds []driver.GuildInfo +} + +type modelRow struct { + Slot string + Model string +} + +type cllamaDetailRow struct { + ProxyType string + ServiceName string + TokenStatus string +} + +func (h *handler) renderDetail(w http.ResponseWriter, r *http.Request) { + raw := strings.TrimPrefix(r.URL.Path, "/detail/") + name, err := url.PathUnescape(raw) + if err != nil || strings.TrimSpace(name) == "" { + http.NotFound(w, r) + return + } + + statuses, statusErr := h.snapshot(r.Context()) + data, ok := h.buildDetailPageData(name, statuses, statusErr) + if !ok { + http.NotFound(w, r) + return + } + + w.Header().Set("Content-Type", "text/html; charset=utf-8") + _ = h.tpl.ExecuteTemplate(w, "detail.html", data) +} + +func (h *handler) buildDetailPageData(name string, statuses map[string]serviceStatus, statusErr string) 
(detailPageData, bool) { + svc, ok := h.manifest.Services[name] + proxyInfo, isProxy := h.proxyByServiceName(name) + if !ok && !isProxy { + return detailPageData{}, false + } + + if !ok && isProxy { + svc = manifestpkg.ServiceManifest{ + ImageRef: proxyInfo.Image, + Count: 1, + } + } + if svc.Count < 1 { + svc.Count = 1 + } + + models := make([]modelRow, 0, len(svc.Models)) + for slot, modelRef := range svc.Models { + models = append(models, modelRow{Slot: slot, Model: modelRef}) + } + sort.Slice(models, func(i, j int) bool { return models[i].Slot < models[j].Slot }) + + handleRows := make([]handleDetailRow, 0, len(svc.Handles)) + for platform, info := range svc.Handles { + if info == nil { + continue + } + handleRows = append(handleRows, handleDetailRow{ + Platform: platform, + Username: info.Username, + ID: info.ID, + Guilds: info.Guilds, + }) + } + sort.Slice(handleRows, func(i, j int) bool { return handleRows[i].Platform < handleRows[j].Platform }) + + cllamaRows := make([]cllamaDetailRow, 0) + proxyByType := make(map[string]string, len(h.manifest.Proxies)) + for _, p := range h.manifest.Proxies { + proxyByType[p.ProxyType] = p.ServiceName + } + tokenStatus := "absent" + if statuses[name].HasCllamaToken { + tokenStatus = "present" + } + for _, proxyType := range svc.Cllama { + serviceName := proxyByType[proxyType] + if serviceName == "" { + serviceName = cllama.ProxyServiceName(proxyType) + } + cllamaRows = append(cllamaRows, cllamaDetailRow{ + ProxyType: proxyType, + ServiceName: serviceName, + TokenStatus: tokenStatus, + }) + } + if isProxy { + cllamaRows = append(cllamaRows, cllamaDetailRow{ + ProxyType: proxyInfo.ProxyType, + ServiceName: proxyInfo.ServiceName, + TokenStatus: "absent", + }) + } + + status := statuses[name] + if status.Service == "" { + status = unknownStatus(name) + } + + return detailPageData{ + PodName: h.manifest.PodName, + ActiveTab: "detail", + ServiceName: name, + ImageRef: firstNonEmpty(svc.ImageRef, proxyInfo.Image), + Count: 
svc.Count, + IsProxy: isProxy, + Status: status, + StatusClass: statusClass(status.Status), + StatusError: statusErr, + Surfaces: svc.Surfaces, + Handles: handleRows, + Skills: slices.Clone(svc.Skills), + Invocations: slices.Clone(svc.Invocations), + Models: models, + Cllama: cllamaRows, + HasStatusErrors: statusErr != "", + }, true +} + +func (h *handler) renderTopology(w http.ResponseWriter, r *http.Request) { + statuses, statusErr := h.snapshot(r.Context()) + data := buildTopologyPageData(h.manifest, statuses, statusErr) + w.Header().Set("Content-Type", "text/html; charset=utf-8") + _ = h.tpl.ExecuteTemplate(w, "topology.html", data) +} + +type apiStatusResponse struct { + GeneratedAt string `json:"generatedAt"` + Services map[string]serviceStatus `json:"services"` + Error string `json:"error,omitempty"` +} + +func (h *handler) renderAPIStatus(w http.ResponseWriter, r *http.Request) { + statuses, err := h.snapshot(r.Context()) + resp := apiStatusResponse{ + GeneratedAt: time.Now().UTC().Format(time.RFC3339), + Services: statuses, + } + code := http.StatusOK + if err != "" { + resp.Error = err + code = http.StatusServiceUnavailable + } + w.Header().Set("Content-Type", "application/json; charset=utf-8") + w.WriteHeader(code) + _ = json.NewEncoder(w).Encode(resp) +} + +func (h *handler) snapshot(ctx context.Context) (map[string]serviceStatus, string) { + names := h.allServiceNames() + timeoutCtx, cancel := context.WithTimeout(ctx, 4*time.Second) + defer cancel() + + statuses, err := h.statusSource.Snapshot(timeoutCtx, names) + if err == nil { + return statuses, "" + } + fallback := make(map[string]serviceStatus, len(names)) + for _, name := range names { + fallback[name] = unknownStatus(name) + } + return fallback, fmt.Sprintf("live status unavailable: %v", err) +} + +func (h *handler) allServiceNames() []string { + set := make(map[string]struct{}, len(h.manifest.Services)+len(h.manifest.Proxies)) + for name := range h.manifest.Services { + set[name] = struct{}{} + 
} + for _, proxy := range h.manifest.Proxies { + if strings.TrimSpace(proxy.ServiceName) != "" { + set[proxy.ServiceName] = struct{}{} + } + } + names := make([]string, 0, len(set)) + for name := range set { + names = append(names, name) + } + sort.Strings(names) + return names +} + +func (h *handler) proxyByServiceName(name string) (manifestpkg.ProxyManifest, bool) { + for _, proxy := range h.manifest.Proxies { + if proxy.ServiceName == name { + return proxy, true + } + } + return manifestpkg.ProxyManifest{}, false +} + +func readManifest(path string) (*manifestpkg.PodManifest, error) { + raw, err := os.ReadFile(path) + if err != nil { + return nil, err + } + var manifest manifestpkg.PodManifest + if err := json.Unmarshal(raw, &manifest); err != nil { + return nil, err + } + if manifest.Services == nil { + manifest.Services = make(map[string]manifestpkg.ServiceManifest) + } + return &manifest, nil +} + +func sortedServiceNames(services map[string]manifestpkg.ServiceManifest) []string { + names := make([]string, 0, len(services)) + for name := range services { + names = append(names, name) + } + sort.Strings(names) + return names +} + +func sortedHandles(handles map[string]*driver.HandleInfo) []handleRow { + out := make([]handleRow, 0, len(handles)) + for platform, info := range handles { + if info == nil { + continue + } + username := info.Username + if strings.TrimSpace(username) == "" { + username = info.ID + } + out = append(out, handleRow{ + Platform: platform, + Username: username, + }) + } + sort.Slice(out, func(i, j int) bool { return out[i].Platform < out[j].Platform }) + return out +} + +func primaryModel(models map[string]string) string { + if len(models) == 0 { + return "" + } + if primary := strings.TrimSpace(models["primary"]); primary != "" { + return primary + } + keys := make([]string, 0, len(models)) + for k := range models { + keys = append(keys, k) + } + sort.Strings(keys) + for _, k := range keys { + if strings.TrimSpace(models[k]) != "" { + 
return models[k] + } + } + return "" +} + +func statusClass(status string) string { + switch strings.ToLower(strings.TrimSpace(status)) { + case "healthy", "running": + return "status-healthy" + case "starting": + return "status-starting" + case "unhealthy", "stopped", "dead", "exited": + return "status-unhealthy" + default: + return "status-unknown" + } +} + +func statusLabel(status string) string { + s := strings.TrimSpace(status) + if s == "" { + return "unknown" + } + return s +} + +func hasStatusData(value string) bool { + return strings.TrimSpace(value) != "" +} + +func truncate(s string, n int) string { + if n <= 0 { + return "" + } + if len(s) <= n { + return s + } + if n <= 3 { + return s[:n] + } + return s[:n-3] + "..." +} + +func joinNonEmpty(items []string, sep string) string { + out := make([]string, 0, len(items)) + for _, item := range items { + item = strings.TrimSpace(item) + if item == "" { + continue + } + out = append(out, item) + } + return strings.Join(out, sep) +} + +func firstNonEmpty(values ...string) string { + for _, v := range values { + if strings.TrimSpace(v) != "" { + return v + } + } + return "" +} + +func firstCostSummary(in *cllamaCostSummary) cllamaCostSummary { + if in == nil { + return cllamaCostSummary{} + } + return *in +} + +func (h *handler) fetchCllamaCostSummary(ctx context.Context) (*cllamaCostSummary, string) { + if len(h.manifest.Proxies) == 0 { + return nil, "" + } + summary, apiErr := h.fetchCllamaCostSummaryFromAPI(ctx) + if summary != nil { + summary.Source = "api" + return summary, "" + } + if !h.costLogFallback { + return nil, apiErr + } + if summary, err := h.fetchCllamaCostSummaryFromLogs(ctx); summary != nil { + summary.Source = "logs" + if strings.TrimSpace(apiErr) != "" { + return summary, fmt.Sprintf("cost API unavailable (%s); showing log-derived estimate", apiErr) + } + if strings.TrimSpace(err) != "" { + return summary, err + } + return summary, "showing log-derived estimate" + } + if 
strings.TrimSpace(apiErr) != "" { + return nil, apiErr + } + return nil, "no cllama cost data available" +} + +func (h *handler) fetchCllamaCostSummaryFromAPI(ctx context.Context) (*cllamaCostSummary, string) { + summary := &cllamaCostSummary{} + success := 0 + lastErr := "" + + for _, proxy := range h.manifest.Proxies { + serviceName := strings.TrimSpace(proxy.ServiceName) + if serviceName == "" { + continue + } + endpoint := fmt.Sprintf("http://%s:8081/costs/api", serviceName) + req, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil) + if err != nil { + lastErr = fmt.Sprintf("build request for %s: %v", serviceName, err) + continue + } + + resp, err := h.httpClient.Do(req) + if err != nil { + lastErr = fmt.Sprintf("%s unavailable: %v", serviceName, err) + continue + } + + var payload map[string]interface{} + if resp.StatusCode == http.StatusOK { + if err := json.NewDecoder(resp.Body).Decode(&payload); err != nil { + lastErr = fmt.Sprintf("%s invalid JSON from /costs/api", serviceName) + _ = resp.Body.Close() + continue + } + } else { + lastErr = fmt.Sprintf("%s missing /costs/api (status %d)", serviceName, resp.StatusCode) + _ = resp.Body.Close() + continue + } + _ = resp.Body.Close() + + summary.TotalCostUSD += asFloat(payload["total_cost_usd"]) + summary.Requests += asInt(payload["total_requests"]) + success++ + } + + if success == 0 { + if strings.TrimSpace(lastErr) == "" { + lastErr = "no cllama cost emission endpoint detected" + } + return nil, lastErr + } + summary.ProxyCount = success + return summary, "" +} + +func (h *handler) fetchCllamaCostSummaryFromLogs(ctx context.Context) (*cllamaCostSummary, string) { + cli, err := client.NewClientWithOpts(client.FromEnv, client.WithAPIVersionNegotiation()) + if err != nil { + return nil, fmt.Sprintf("docker client unavailable for cost log fallback: %v", err) + } + defer cli.Close() + + summary := &cllamaCostSummary{} + success := 0 + lastErr := "" + + for _, proxy := range h.manifest.Proxies { + 
serviceName := strings.TrimSpace(proxy.ServiceName) + if serviceName == "" { + continue + } + + containerID, err := findProxyContainerID(ctx, cli, h.manifest.PodName, serviceName) + if err != nil { + lastErr = fmt.Sprintf("%s container lookup failed: %v", serviceName, err) + continue + } + + rc, err := cli.ContainerLogs(ctx, containerID, containerapi.LogsOptions{ + ShowStdout: true, + ShowStderr: false, + Tail: "500", + }) + if err != nil { + lastErr = fmt.Sprintf("%s log read failed: %v", serviceName, err) + continue + } + + var stdout bytes.Buffer + var stderr bytes.Buffer + _, copyErr := stdcopy.StdCopy(&stdout, &stderr, rc) + _ = rc.Close() + if copyErr != nil && copyErr != io.EOF { + lastErr = fmt.Sprintf("%s log decode failed: %v", serviceName, copyErr) + continue + } + + total, reqs := parseCostSummaryFromLogs(stdout.String()) + summary.TotalCostUSD += total + summary.Requests += reqs + success++ + } + + if success == 0 { + if strings.TrimSpace(lastErr) == "" { + lastErr = "no proxy logs available for cost fallback" + } + return nil, lastErr + } + summary.ProxyCount = success + return summary, "" +} + +func findProxyContainerID(ctx context.Context, cli *client.Client, podName, serviceName string) (string, error) { + args := filters.NewArgs( + filters.Arg("label", "claw.pod="+strings.TrimSpace(podName)), + filters.Arg("label", "claw.service="+strings.TrimSpace(serviceName)), + ) + containers, err := cli.ContainerList(ctx, containerapi.ListOptions{ + All: true, + Filters: args, + }) + if err != nil { + return "", err + } + if len(containers) == 0 { + return "", fmt.Errorf("not found") + } + return containers[0].ID, nil +} + +func parseCostSummaryFromLogs(logs string) (float64, int) { + total := 0.0 + requests := 0 + scanner := bufio.NewScanner(strings.NewReader(logs)) + for scanner.Scan() { + line := strings.TrimSpace(scanner.Text()) + if line == "" || !strings.HasPrefix(line, "{") { + continue + } + var payload map[string]interface{} + if err := 
json.Unmarshal([]byte(line), &payload); err != nil { + continue + } + if _, ok := payload["cost_usd"]; !ok { + continue + } + total += asFloat(payload["cost_usd"]) + requests++ + } + return total, requests +} + +func asFloat(v interface{}) float64 { + switch n := v.(type) { + case float64: + return n + case float32: + return float64(n) + case int: + return float64(n) + case int64: + return float64(n) + case json.Number: + f, err := n.Float64() + if err == nil { + return f + } + case string: + f, err := strconv.ParseFloat(strings.TrimSpace(n), 64) + if err == nil { + return f + } + } + return 0 +} + +func asInt(v interface{}) int { + switch n := v.(type) { + case float64: + return int(n) + case float32: + return int(n) + case int: + return n + case int64: + return int(n) + case json.Number: + i, err := n.Int64() + if err == nil { + return int(i) + } + case string: + i, err := strconv.Atoi(strings.TrimSpace(n)) + if err == nil { + return i + } + } + return 0 +} diff --git a/cmd/clawdash/handler_test.go b/cmd/clawdash/handler_test.go new file mode 100644 index 0000000..9ffa694 --- /dev/null +++ b/cmd/clawdash/handler_test.go @@ -0,0 +1,200 @@ +package main + +import ( + "context" + "encoding/json" + "fmt" + "io" + "net/http" + "net/http/httptest" + "strings" + "testing" + "time" + + manifestpkg "github.com/mostlydev/clawdapus/internal/clawdash" + "github.com/mostlydev/clawdapus/internal/driver" +) + +type fakeStatusSource struct { + statuses map[string]serviceStatus + err error +} + +func (f fakeStatusSource) Snapshot(_ context.Context, _ []string) (map[string]serviceStatus, error) { + if f.err != nil { + return nil, f.err + } + return f.statuses, nil +} + +func testManifest() *manifestpkg.PodManifest { + return &manifestpkg.PodManifest{ + PodName: "fleet", + Services: map[string]manifestpkg.ServiceManifest{ + "bot": { + ClawType: "openclaw", + ImageRef: "bot:latest", + Count: 1, + Surfaces: []manifestpkg.SurfaceManifest{ + {Scheme: "channel", Target: "discord"}, + 
{Scheme: "service", Target: "api"}, + {Scheme: "volume", Target: "shared-data"}, + }, + Cllama: []string{"passthrough"}, + Handles: map[string]*driver.HandleInfo{ + "discord": {ID: "123", Username: "fleet-bot"}, + }, + }, + "api": { + ImageRef: "api:latest", + Count: 1, + }, + }, + Proxies: []manifestpkg.ProxyManifest{ + {ProxyType: "passthrough", ServiceName: "cllama", Image: "cllama:latest"}, + }, + } +} + +func testStatuses() map[string]serviceStatus { + return map[string]serviceStatus{ + "bot": { + Service: "bot", + Status: "healthy", + State: "running", + Uptime: "3m 2s", + Instances: 1, + Running: 1, + }, + "api": { + Service: "api", + Status: "running", + State: "running", + Uptime: "8m 10s", + Instances: 1, + Running: 1, + }, + "cllama": { + Service: "cllama", + Status: "healthy", + State: "running", + Uptime: "3m 1s", + Instances: 1, + Running: 1, + }, + } +} + +func TestFleetPageRenders(t *testing.T) { + h := newHandler(testManifest(), fakeStatusSource{statuses: testStatuses()}, "http://localhost:8181", false) + req := httptest.NewRequest(http.MethodGet, "/", nil) + w := httptest.NewRecorder() + h.ServeHTTP(w, req) + + if w.Code != http.StatusOK { + t.Fatalf("expected 200, got %d", w.Code) + } + body := w.Body.String() + if !strings.Contains(body, "Fleet Overview") { + t.Fatalf("expected fleet heading in body") + } + if !strings.Contains(body, "bot") { + t.Fatalf("expected service name in body") + } + if !strings.Contains(body, "Costs") { + t.Fatalf("expected costs panel in body") + } + if !strings.Contains(body, "Cost emission not available yet") { + t.Fatalf("expected costs emission warning in body") + } + if strings.Contains(body, "Open cllama dashboard") { + t.Fatalf("expected costs link to be hidden when /costs/api is unavailable") + } +} + +func TestFleetPageShowsCostLinkWhenCostAPIAvailable(t *testing.T) { + raw := newHandler(testManifest(), fakeStatusSource{statuses: testStatuses()}, "http://localhost:8181", false) + h, ok := raw.(*handler) + if 
!ok { + t.Fatal("expected *handler") + } + h.httpClient = &http.Client{ + Timeout: time.Second, + Transport: roundTripFunc(func(req *http.Request) (*http.Response, error) { + if req.URL.String() != "http://cllama:8081/costs/api" { + return nil, fmt.Errorf("unexpected URL: %s", req.URL.String()) + } + return &http.Response{ + StatusCode: http.StatusOK, + Header: make(http.Header), + Body: io.NopCloser(strings.NewReader(`{"total_cost_usd":1.2345,"total_requests":42}`)), + }, nil + }), + } + + req := httptest.NewRequest(http.MethodGet, "/", nil) + w := httptest.NewRecorder() + h.ServeHTTP(w, req) + + if w.Code != http.StatusOK { + t.Fatalf("expected 200, got %d", w.Code) + } + body := w.Body.String() + if !strings.Contains(body, "$1.2345") { + t.Fatalf("expected rendered API cost summary, got body:\n%s", body) + } + if !strings.Contains(body, "Open cllama dashboard") { + t.Fatalf("expected costs link when API summary is available") + } +} + +func TestTopologyPageRenders(t *testing.T) { + h := newHandler(testManifest(), fakeStatusSource{statuses: testStatuses()}, "http://localhost:8181", false) + req := httptest.NewRequest(http.MethodGet, "/topology", nil) + w := httptest.NewRecorder() + h.ServeHTTP(w, req) + + if w.Code != http.StatusOK { + t.Fatalf("expected 200, got %d", w.Code) + } + if !strings.Contains(w.Body.String(), "Topology") { + t.Fatalf("expected topology title in body") + } +} + +func TestAPIStatusJSON(t *testing.T) { + h := newHandler(testManifest(), fakeStatusSource{statuses: testStatuses()}, "http://localhost:8181", false) + req := httptest.NewRequest(http.MethodGet, "/api/status", nil) + w := httptest.NewRecorder() + h.ServeHTTP(w, req) + + if w.Code != http.StatusOK { + t.Fatalf("expected 200, got %d body=%s", w.Code, w.Body.String()) + } + var payload struct { + Services map[string]serviceStatus `json:"services"` + } + if err := json.Unmarshal(w.Body.Bytes(), &payload); err != nil { + t.Fatalf("invalid json: %v", err) + } + if 
payload.Services["bot"].Status != "healthy" { + t.Fatalf("expected bot healthy, got %q", payload.Services["bot"].Status) + } +} + +func TestDetailMissingServiceNotFound(t *testing.T) { + h := newHandler(testManifest(), fakeStatusSource{statuses: testStatuses()}, "http://localhost:8181", false) + req := httptest.NewRequest(http.MethodGet, "/detail/missing", nil) + w := httptest.NewRecorder() + h.ServeHTTP(w, req) + + if w.Code != http.StatusNotFound { + t.Fatalf("expected 404, got %d", w.Code) + } +} + +type roundTripFunc func(*http.Request) (*http.Response, error) + +func (f roundTripFunc) RoundTrip(req *http.Request) (*http.Response, error) { + return f(req) +} diff --git a/cmd/clawdash/main.go b/cmd/clawdash/main.go new file mode 100644 index 0000000..4df53a4 --- /dev/null +++ b/cmd/clawdash/main.go @@ -0,0 +1,125 @@ +package main + +import ( + "context" + "errors" + "fmt" + "net/http" + "os" + "os/signal" + "strings" + "syscall" + "time" +) + +func main() { + cfg := loadConfig() + + if len(os.Args) > 1 && strings.TrimSpace(os.Args[1]) == "-healthcheck" { + if err := runHealthcheck(cfg); err != nil { + fmt.Fprintln(os.Stderr, err.Error()) + os.Exit(1) + } + return + } + + if err := run(cfg); err != nil { + fmt.Fprintln(os.Stderr, err.Error()) + os.Exit(1) + } +} + +type config struct { + Addr string + ManifestPath string + CllamaCostsURL string + CostLogFallback bool +} + +func loadConfig() config { + return config{ + Addr: envOr("CLAWDASH_ADDR", ":8082"), + ManifestPath: envOr("CLAWDASH_MANIFEST", "/claw/pod-manifest.json"), + CllamaCostsURL: strings.TrimSpace(os.Getenv("CLAWDASH_CLLAMA_COSTS_URL")), + CostLogFallback: envBool( + "CLAWDASH_COST_LOG_FALLBACK", + ), + } +} + +func run(cfg config) error { + manifest, err := readManifest(cfg.ManifestPath) + if err != nil { + return fmt.Errorf("clawdash: read manifest: %w", err) + } + + source, err := newDockerStatusSource(manifest.PodName) + if err != nil { + return fmt.Errorf("clawdash: docker client: %w", err) + } 
+ defer source.Close() + + h := newHandler(manifest, source, cfg.CllamaCostsURL, cfg.CostLogFallback) + srv := &http.Server{ + Addr: cfg.Addr, + Handler: h, + ReadHeaderTimeout: 10 * time.Second, + } + + errCh := make(chan error, 1) + go func() { + fmt.Fprintf(os.Stderr, "clawdash ui listening on %s\n", cfg.Addr) + errCh <- srv.ListenAndServe() + }() + + sigCh := make(chan os.Signal, 1) + signal.Notify(sigCh, os.Interrupt, syscall.SIGTERM) + + select { + case sig := <-sigCh: + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + _ = sig + return srv.Shutdown(ctx) + case err := <-errCh: + if errors.Is(err, http.ErrServerClosed) { + return nil + } + return err + } +} + +func runHealthcheck(cfg config) error { + manifest, err := readManifest(cfg.ManifestPath) + if err != nil { + return fmt.Errorf("clawdash healthcheck: read manifest: %w", err) + } + if strings.TrimSpace(manifest.PodName) == "" { + return fmt.Errorf("clawdash healthcheck: manifest podName is empty") + } + source, err := newDockerStatusSource(manifest.PodName) + if err != nil { + return fmt.Errorf("clawdash healthcheck: docker client: %w", err) + } + defer source.Close() + + ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second) + defer cancel() + if err := source.Ping(ctx); err != nil { + return fmt.Errorf("clawdash healthcheck: docker ping failed: %w", err) + } + return nil +} + +func envOr(key, fallback string) string { + v := strings.TrimSpace(os.Getenv(key)) + if v == "" { + return fallback + } + return v +} + +func envBool(key string) bool { + v := strings.ToLower(strings.TrimSpace(os.Getenv(key))) + return v == "1" || v == "true" || v == "yes" || v == "on" +} diff --git a/cmd/clawdash/status.go b/cmd/clawdash/status.go new file mode 100644 index 0000000..5f1cb0a --- /dev/null +++ b/cmd/clawdash/status.go @@ -0,0 +1,290 @@ +package main + +import ( + "context" + "fmt" + "sort" + "strings" + "time" + + "github.com/docker/docker/api/types" + 
"github.com/docker/docker/api/types/container" + "github.com/docker/docker/api/types/filters" + "github.com/docker/docker/client" +) + +type serviceStatus struct { + Service string `json:"service"` + Status string `json:"status"` + State string `json:"state"` + Health string `json:"health,omitempty"` + Uptime string `json:"uptime"` + ContainerID string `json:"containerId,omitempty"` + Instances int `json:"instances"` + Running int `json:"running"` + HasCllamaToken bool `json:"hasCllamaToken,omitempty"` +} + +type dockerStatusSource struct { + podName string + cli *client.Client + now func() time.Time +} + +type instance struct { + id string + status string + state string + health string + startedAt time.Time + running bool + hasCllamaToken bool +} + +func newDockerStatusSource(podName string) (*dockerStatusSource, error) { + cli, err := client.NewClientWithOpts(client.FromEnv, client.WithAPIVersionNegotiation()) + if err != nil { + return nil, err + } + return &dockerStatusSource{ + podName: podName, + cli: cli, + now: time.Now, + }, nil +} + +func (d *dockerStatusSource) Close() error { + return d.cli.Close() +} + +func (d *dockerStatusSource) Ping(ctx context.Context) error { + _, err := d.cli.Ping(ctx) + return err +} + +func (d *dockerStatusSource) Snapshot(ctx context.Context, serviceNames []string) (map[string]serviceStatus, error) { + nameSet := make(map[string]struct{}, len(serviceNames)) + out := make(map[string]serviceStatus, len(serviceNames)) + for _, name := range serviceNames { + nameSet[name] = struct{}{} + out[name] = unknownStatus(name) + } + + args := filters.NewArgs(filters.Arg("label", "claw.pod="+d.podName)) + containers, err := d.cli.ContainerList(ctx, container.ListOptions{ + All: true, + Filters: args, + }) + if err != nil { + return nil, err + } + + buckets := make(map[string][]instance) + for _, c := range containers { + serviceName := serviceNameFromLabels(c.Labels, c.Names) + if serviceName == "" { + continue + } + if _, ok := 
nameSet[serviceName]; !ok { + continue + } + + inspect, err := d.cli.ContainerInspect(ctx, c.ID) + if err != nil { + continue + } + inst := containerToInstance(inspect) + buckets[serviceName] = append(buckets[serviceName], inst) + } + + now := d.now() + for serviceName, instances := range buckets { + out[serviceName] = aggregateInstances(serviceName, instances, now) + } + + return out, nil +} + +func unknownStatus(service string) serviceStatus { + return serviceStatus{ + Service: service, + Status: "unknown", + State: "unknown", + Uptime: "-", + Instances: 0, + Running: 0, + } +} + +func serviceNameFromLabels(labels map[string]string, names []string) string { + if labels == nil { + labels = map[string]string{} + } + if v := strings.TrimSpace(labels["claw.service"]); v != "" { + return v + } + if v := strings.TrimSpace(labels["com.docker.compose.service"]); v != "" { + return v + } + if len(names) > 0 { + return strings.TrimPrefix(names[0], "/") + } + return "" +} + +func containerToInstance(info types.ContainerJSON) instance { + state := "unknown" + health := "" + running := false + startedAt := time.Time{} + + if info.ContainerJSONBase != nil && info.State != nil { + state = strings.ToLower(strings.TrimSpace(info.State.Status)) + running = info.State.Running + if info.State.Health != nil { + health = strings.ToLower(strings.TrimSpace(info.State.Health.Status)) + } + if started := strings.TrimSpace(info.State.StartedAt); started != "" { + if ts, err := time.Parse(time.RFC3339Nano, started); err == nil { + startedAt = ts + } + } + } + + hasToken := false + if info.Config != nil { + for _, raw := range info.Config.Env { + k, v, ok := strings.Cut(raw, "=") + if !ok { + continue + } + if strings.TrimSpace(k) == "CLLAMA_TOKEN" && strings.TrimSpace(v) != "" { + hasToken = true + break + } + } + } + + return instance{ + id: info.ID, + status: normalizeStatus(state, running, health), + state: state, + health: health, + startedAt: startedAt, + running: running, + 
hasCllamaToken: hasToken, + } +} + +func aggregateInstances(service string, instances []instance, now time.Time) serviceStatus { + if len(instances) == 0 { + return unknownStatus(service) + } + + sort.Slice(instances, func(i, j int) bool { + return statusSeverity(instances[i].status) > statusSeverity(instances[j].status) + }) + worst := instances[0] + + running := 0 + hasToken := false + longest := time.Duration(0) + for _, inst := range instances { + if inst.running { + running++ + if !inst.startedAt.IsZero() { + if dur := now.Sub(inst.startedAt); dur > longest { + longest = dur + } + } + } + if inst.hasCllamaToken { + hasToken = true + } + } + + uptime := "-" + if longest > 0 { + uptime = formatDuration(longest) + } + + return serviceStatus{ + Service: service, + Status: worst.status, + State: worst.state, + Health: worst.health, + Uptime: uptime, + ContainerID: shortID(worst.id), + Instances: len(instances), + Running: running, + HasCllamaToken: hasToken, + } +} + +func normalizeStatus(state string, running bool, health string) string { + if running { + if health == "healthy" || health == "unhealthy" || health == "starting" { + return health + } + return "running" + } + + switch state { + case "restarting", "created", "paused": + return "starting" + case "dead", "exited", "removing", "": + return "stopped" + default: + return state + } +} + +func statusSeverity(status string) int { + switch status { + case "healthy": + return 0 + case "running": + return 1 + case "starting": + return 2 + case "unknown": + return 2 + case "unhealthy": + return 3 + case "stopped": + return 4 + default: + return 3 + } +} + +func formatDuration(d time.Duration) string { + if d < 0 { + d = 0 + } + d = d.Round(time.Second) + h := int(d / time.Hour) + d -= time.Duration(h) * time.Hour + m := int(d / time.Minute) + d -= time.Duration(m) * time.Minute + s := int(d / time.Second) + + if h > 0 { + return fmt.Sprintf("%dh %dm", h, m) + } + if m > 0 { + return fmt.Sprintf("%dm %ds", m, s) + 
} + return fmt.Sprintf("%ds", s) +} + +func shortID(id string) string { + id = strings.TrimSpace(id) + if id == "" { + return "" + } + if len(id) <= 12 { + return id + } + return id[:12] +} diff --git a/cmd/clawdash/templates/detail.html b/cmd/clawdash/templates/detail.html new file mode 100644 index 0000000..d917db0 --- /dev/null +++ b/cmd/clawdash/templates/detail.html @@ -0,0 +1,573 @@ + + + + + + clawdapus dash - detail + + + + + + +
+ CLAWDAPUS DASH {{.PodName}} + +
+
+ +
+
Fleet / {{.ServiceName}}
+
+
+

{{.ServiceName}}

+
image: {{if .ImageRef}}{{.ImageRef}}{{else}}-{{end}} {{if gt .Count 1}}| replicas: {{.Count}}{{end}}
+
+ {{if .IsProxy}} + proxy + {{end}} +
+ + {{if .HasStatusErrors}} + + {{end}} + +
+
+ Status +
+
+
+
Health
+
+ + + {{statusLabel .Status.Status}} + +
+
+
+
Uptime
+
{{.Status.Uptime}}
+
+
+
Container
+
{{if .Status.ContainerID}}{{.Status.ContainerID}}{{else}}-{{end}}
+
+
+
Instances
+
{{.Status.Running}} / {{.Status.Instances}}
+
+
+
+
+
+ + {{if .Surfaces}} +
+
+ Surfaces +
+ + + + + + {{range .Surfaces}} + + + + + + + {{end}} + +
SchemeTargetAccessPorts
{{.Scheme}}{{.Target}}{{if .AccessMode}}{{.AccessMode}}{{else}}-{{end}}{{if .Ports}}{{join .Ports ", "}}{{else}}-{{end}}
+
+
+
+ {{end}} + + {{if .Handles}} +
+
+ Handles +
+ + + + + + {{range .Handles}} + + + + + + + {{end}} + +
PlatformUsernameIDGuilds
{{.Platform}}{{if .Username}}{{.Username}}{{else}}-{{end}}{{.ID}} + {{if .Guilds}} + {{range .Guilds}} +
+
{{if .Name}}{{.Name}}{{else}}guild{{end}} ({{.ID}})
+
{{if .Channels}}{{range .Channels}}{{if .Name}}{{.Name}}{{else}}{{.ID}}{{end}} ({{.ID}})
{{end}}{{else}}No channels{{end}}
+
+ {{end}} + {{else}} + - + {{end}} +
+
+
+
+ {{end}} + + {{if .Skills}} +
+
+ Skills +
+
    + {{range .Skills}} +
  • {{.}}
  • + {{end}} +
+
+
+
+ {{end}} + + {{if .Invocations}} +
+
+ Invoke +
+ + + + + + {{range .Invocations}} + + + + + + + {{end}} + +
NameScheduleMessageTo
{{if .Name}}{{.Name}}{{else}}-{{end}}{{if .Schedule}}{{.Schedule}}{{else}}-{{end}}{{if .Message}}{{truncate .Message 120}}{{else}}-{{end}}{{if .To}}{{.To}}{{else}}-{{end}}
+
+
+
+ {{end}} + + {{if .Models}} +
+
+ Models +
+ + + + + + {{range .Models}} + + + + + {{end}} + +
SlotProvider / Model
{{.Slot}}{{.Model}}
+
+
+
+ {{end}} + + {{if .Cllama}} +
+
+ Cllama +
+ + + + + + {{range .Cllama}} + + + + + + {{end}} + +
Proxy TypeProxy ServiceToken
{{.ProxyType}}{{.ServiceName}}{{.TokenStatus}}
+
+
+
+ {{end}} + + +
+ + + + diff --git a/cmd/clawdash/templates/fleet.html b/cmd/clawdash/templates/fleet.html new file mode 100644 index 0000000..a956121 --- /dev/null +++ b/cmd/clawdash/templates/fleet.html @@ -0,0 +1,638 @@ + + + + + + clawdapus dash - fleet + + + + + + +
+ CLAWDAPUS DASH {{.PodName}} + + +
+ +
+

Fleet Overview

+

Live view of agents, proxies, and infrastructure for pod {{.PodName}}.

+ {{if not .HasCllama}} +

This pod has no cllama proxy, so the cllama costs dashboard link is hidden.

+ {{end}} + + {{if .HasStatusErrors}} + + {{end}} + + {{if .HasCllama}} +
+
+

Costs

+ cllama +
+
+ {{if .HasCostSummary}} +
+
+
Total USD
+
${{printf "%.4f" .CostSummary.TotalCostUSD}}
+
+
+
Requests
+
{{.CostSummary.Requests}}
+
+
+
Proxies Reporting
+
{{.CostSummary.ProxyCount}}
+
+
+ {{if .CostSummaryErr}} +

{{.CostSummaryErr}}

+ {{end}} + {{else}} +

Cost emission not available yet: {{.CostSummaryErr}}

+ {{end}} + {{if .HasCostLink}} + Open cllama dashboard + {{end}} +
+
+ {{end}} + +
+
+

Agents

+ {{len .Agents}} +
+ {{if .Agents}} + + {{else}} +
No claw agents found in this pod manifest.
+ {{end}} +
+ + {{if .HasCllama}} +
+
+

Proxies

+ {{len .Proxies}} +
+ +
+ {{end}} + +
+
+

Infrastructure

+ {{len .Infrastructure}} +
+ {{if .Infrastructure}} + + {{else}} +
No native infrastructure services detected.
+ {{end}} +
+ + +
+ + + + diff --git a/cmd/clawdash/templates/topology.html b/cmd/clawdash/templates/topology.html new file mode 100644 index 0000000..f1b24bb --- /dev/null +++ b/cmd/clawdash/templates/topology.html @@ -0,0 +1,382 @@ + + + + + + clawdapus dash - topology + + + + + + +
+ CLAWDAPUS DASH {{.PodName}} + +
+
+ +
+

Topology

+

+ Channels, agents, {{if .HasCllama}}proxies, {{end}}services, and volumes with live health markers. +

+ + {{if .HasStatusErrors}} + + {{end}} + +
+
+ {{range .Lanes}} +
{{.Title}}
+ {{end}} +
+ + {{if .HasNodes}} +
+ + {{range .Edges}} + + {{end}} + + + {{range .Nodes}} +
+ {{.Label}} + +
+ {{end}} +
+
+ Hover a node to isolate connected paths. Colors: cyan channel, amber service, green volume/host{{if .HasCllama}}, purple proxy{{end}}. +
+ {{else}} +
No topology nodes are available for this pod snapshot yet.
+ {{end}} + + +
+
+ + + + diff --git a/cmd/clawdash/topology.go b/cmd/clawdash/topology.go new file mode 100644 index 0000000..29b1829 --- /dev/null +++ b/cmd/clawdash/topology.go @@ -0,0 +1,340 @@ +package main + +import ( + "fmt" + "sort" + "strings" + + manifestpkg "github.com/mostlydev/clawdapus/internal/clawdash" + "github.com/mostlydev/clawdapus/internal/cllama" +) + +type topologyPageData struct { + PodName string + ActiveTab string + Lanes []topologyLane + CanvasWidth int + CanvasHeight int + Nodes []topologyNode + Edges []topologyEdge + HasNodes bool + HasCllama bool + StatusError string + HasStatusErrors bool +} + +type topologyLane struct { + Key string + Title string +} + +type topologyNode struct { + ID string + Label string + Lane string + ServiceName string + X int + Y int + Width int + Height int + Status string + StatusClass string + Neighbors string +} + +type topologyEdge struct { + FromID string + ToID string + Path string + Color string +} + +type edgeDef struct { + fromLane string + fromName string + toLane string + toName string + kind string +} + +func buildTopologyPageData(manifest *manifestpkg.PodManifest, statuses map[string]serviceStatus, statusErr string) topologyPageData { + proxyByType := make(map[string]string, len(manifest.Proxies)) + proxyNames := make([]string, 0, len(manifest.Proxies)) + for _, proxy := range manifest.Proxies { + proxyByType[proxy.ProxyType] = proxy.ServiceName + proxyNames = append(proxyNames, proxy.ServiceName) + } + sort.Strings(proxyNames) + + agentNames := make([]string, 0) + channelSet := map[string]struct{}{} + serviceSet := map[string]struct{}{} + volumeSet := map[string]struct{}{} + edgeDefs := make([]edgeDef, 0) + + serviceNames := sortedServiceNames(manifest.Services) + for _, serviceName := range serviceNames { + svc := manifest.Services[serviceName] + if svc.ClawType != "" { + agentNames = append(agentNames, serviceName) + } else { + serviceSet[serviceName] = struct{}{} + } + } + sort.Strings(agentNames) + + for _, 
agent := range agentNames { + svc := manifest.Services[agent] + for _, surface := range svc.Surfaces { + switch surface.Scheme { + case "channel": + channelSet[surface.Target] = struct{}{} + edgeDefs = append(edgeDefs, edgeDef{ + fromLane: "channel", fromName: surface.Target, + toLane: "agent", toName: agent, + kind: "channel", + }) + case "service": + serviceSet[surface.Target] = struct{}{} + edgeDefs = append(edgeDefs, edgeDef{ + fromLane: "agent", fromName: agent, + toLane: "service", toName: surface.Target, + kind: "service", + }) + case "volume": + volumeSet[surface.Target] = struct{}{} + edgeDefs = append(edgeDefs, edgeDef{ + fromLane: "agent", fromName: agent, + toLane: "volume", toName: surface.Target, + kind: "volume", + }) + case "host": + hostTarget := "host:" + surface.Target + volumeSet[hostTarget] = struct{}{} + edgeDefs = append(edgeDefs, edgeDef{ + fromLane: "agent", fromName: agent, + toLane: "volume", toName: hostTarget, + kind: "volume", + }) + } + } + + for _, proxyType := range svc.Cllama { + proxyService := proxyByType[proxyType] + if proxyService == "" { + proxyService = cllama.ProxyServiceName(proxyType) + if strings.TrimSpace(proxyService) != "" { + proxyNames = append(proxyNames, proxyService) + } + } + edgeDefs = append(edgeDefs, edgeDef{ + fromLane: "agent", fromName: agent, + toLane: "proxy", toName: proxyService, + kind: "proxy", + }) + } + } + + proxyNames = uniqueSorted(proxyNames) + channels := sortedSet(channelSet) + services := sortedSet(serviceSet) + volumes := sortedSet(volumeSet) + + const ( + nodeW = 172 + nodeH = 44 + xStart = 24 + yStart = 52 + laneGap = 220 + rowGap = 68 + canvasPad = 36 + minRows = 3 + ) + + lanesMeta := make([]topologyLane, 0, 5) + lanesMeta = append(lanesMeta, topologyLane{Key: "channel", Title: "Channels"}) + lanesMeta = append(lanesMeta, topologyLane{Key: "agent", Title: "Agents"}) + hasCllama := len(proxyNames) > 0 + if hasCllama { + lanesMeta = append(lanesMeta, topologyLane{Key: "proxy", Title: 
"Proxies"}) + } + lanesMeta = append(lanesMeta, topologyLane{Key: "service", Title: "Services"}) + lanesMeta = append(lanesMeta, topologyLane{Key: "volume", Title: "Volumes"}) + + laneX := make(map[string]int, len(lanesMeta)) + for i, lane := range lanesMeta { + laneX[lane.Key] = xStart + laneGap*i + } + + type laneNodes struct { + lane string + names []string + } + lanes := make([]laneNodes, 0, len(lanesMeta)) + for _, lane := range lanesMeta { + names := []string{} + switch lane.Key { + case "channel": + names = channels + case "agent": + names = agentNames + case "proxy": + names = proxyNames + case "service": + names = services + case "volume": + names = volumes + } + lanes = append(lanes, laneNodes{lane: lane.Key, names: names}) + } + + nodeMap := make(map[string]topologyNode) + nodes := make([]topologyNode, 0) + maxRows := minRows + for _, lane := range lanes { + if len(lane.names) > maxRows { + maxRows = len(lane.names) + } + for row, name := range lane.names { + serviceName := "" + switch lane.lane { + case "agent", "proxy", "service": + serviceName = name + } + status := statuses[serviceName] + if strings.TrimSpace(status.Status) == "" { + status = unknownStatus(serviceName) + } + if serviceName == "" { + status.Status = "n/a" + status.Uptime = "-" + } + + node := topologyNode{ + ID: topologyNodeID(lane.lane, name), + Label: name, + Lane: lane.lane, + ServiceName: serviceName, + X: laneX[lane.lane], + Y: yStart + row*rowGap, + Width: nodeW, + Height: nodeH, + Status: status.Status, + StatusClass: statusClass(status.Status), + } + nodes = append(nodes, node) + nodeMap[nodeKey(lane.lane, name)] = node + } + } + + neighborMap := make(map[string]map[string]struct{}) + edges := make([]topologyEdge, 0) + seenEdges := make(map[string]struct{}) + for _, edge := range edgeDefs { + from, okFrom := nodeMap[nodeKey(edge.fromLane, edge.fromName)] + to, okTo := nodeMap[nodeKey(edge.toLane, edge.toName)] + if !okFrom || !okTo { + continue + } + + key := from.ID + ">" + 
to.ID + ":" + edge.kind + if _, exists := seenEdges[key]; exists { + continue + } + seenEdges[key] = struct{}{} + + x1 := from.X + from.Width + y1 := from.Y + from.Height/2 + x2 := to.X + y2 := to.Y + to.Height/2 + mid := (x1 + x2) / 2 + + edges = append(edges, topologyEdge{ + FromID: from.ID, + ToID: to.ID, + Path: fmt.Sprintf("M %d %d C %d %d, %d %d, %d %d", x1, y1, mid, y1, mid, y2, x2, y2), + Color: topologyEdgeColor(edge.kind), + }) + + if neighborMap[from.ID] == nil { + neighborMap[from.ID] = map[string]struct{}{} + } + if neighborMap[to.ID] == nil { + neighborMap[to.ID] = map[string]struct{}{} + } + neighborMap[from.ID][to.ID] = struct{}{} + neighborMap[to.ID][from.ID] = struct{}{} + } + + for i := range nodes { + neighbors := sortedSet(neighborMap[nodes[i].ID]) + nodes[i].Neighbors = strings.Join(neighbors, ",") + } + + canvasWidth := xStart + laneGap*(len(lanesMeta)-1) + nodeW + canvasPad + canvasHeight := yStart + maxRows*rowGap + canvasPad + if canvasHeight < 300 { + canvasHeight = 300 + } + + return topologyPageData{ + PodName: manifest.PodName, + ActiveTab: "topology", + Lanes: lanesMeta, + CanvasWidth: canvasWidth, + CanvasHeight: canvasHeight, + Nodes: nodes, + Edges: edges, + HasNodes: len(nodes) > 0, + HasCllama: hasCllama, + StatusError: statusErr, + HasStatusErrors: statusErr != "", + } +} + +func topologyNodeID(lane, name string) string { + safe := strings.ToLower(strings.TrimSpace(name)) + replacer := strings.NewReplacer(" ", "-", "/", "-", ":", "-", ".", "-", "_", "-") + safe = replacer.Replace(safe) + return lane + "-" + safe +} + +func nodeKey(lane, name string) string { + return lane + "|" + name +} + +func topologyEdgeColor(kind string) string { + switch kind { + case "channel": + return "var(--cyan)" + case "service": + return "var(--amber)" + case "volume": + return "var(--green)" + case "proxy": + return "var(--purple)" + default: + return "var(--line-bright)" + } +} + +func sortedSet(set map[string]struct{}) []string { + out := 
make([]string, 0, len(set)) + for v := range set { + out = append(out, v) + } + sort.Strings(out) + return out +} + +func uniqueSorted(items []string) []string { + set := make(map[string]struct{}, len(items)) + for _, item := range items { + item = strings.TrimSpace(item) + if item == "" { + continue + } + set[item] = struct{}{} + } + return sortedSet(set) +} diff --git a/dockerfiles/clawdash/Dockerfile b/dockerfiles/clawdash/Dockerfile new file mode 100644 index 0000000..2f6ed1e --- /dev/null +++ b/dockerfiles/clawdash/Dockerfile @@ -0,0 +1,13 @@ +FROM golang:1.23 AS build +WORKDIR /src +COPY go.mod go.sum* ./ +RUN go mod download 2>/dev/null || true +COPY . . +RUN CGO_ENABLED=0 go build -o /clawdash ./cmd/clawdash + +FROM gcr.io/distroless/static-debian12 +COPY --from=build /clawdash /clawdash +EXPOSE 8082 +HEALTHCHECK --interval=15s --timeout=5s --retries=3 \ + CMD ["/clawdash", "-healthcheck"] +ENTRYPOINT ["/clawdash"] diff --git a/docs/plans/2026-02-26-cllama-cost-hooks.md b/docs/plans/2026-02-26-cllama-cost-hooks.md index 9b901b7..0fea657 100644 --- a/docs/plans/2026-02-26-cllama-cost-hooks.md +++ b/docs/plans/2026-02-26-cllama-cost-hooks.md @@ -6,6 +6,8 @@ **Architecture:** The proxy already intercepts every LLM request and response. Cost hooks read the `usage` block from OpenAI-compatible responses, multiply by a pricing table, and aggregate in-memory per agent. A new `internal/cost/` package owns the pricing table and accumulator. The logger gains `tokens_in`, `tokens_out`, `cost_usd` fields. The UI gains a `/costs` page. No persistent storage — costs reset on proxy restart (persistence is a future concern; structured logs are the durable record). +**Emission contract for fleet dashboards:** `GET /costs/api` is the stable machine-readable interface for downstream dashboards (including Clawdapus Dash). UI routing (`/` vs `/costs`) may evolve, but `/costs/api` should remain stable and versioned when shape changes. 
+ + **Tech Stack:** Go 1.23, `sync` (thread-safe accumulator), `encoding/json` (response parsing), `html/template` (UI), `time` (windowed stats) **Repo:** `mostlydev/cllama-passthrough` (at `/Users/wojtek/dev/ai/clawdapus/cllama-passthrough`) diff --git a/docs/plans/2026-02-28-clawdash-dashboard-design.md b/docs/plans/2026-02-28-clawdash-dashboard-design.md new file mode 100644 index 0000000..9c4d95c --- /dev/null +++ b/docs/plans/2026-02-28-clawdash-dashboard-design.md @@ -0,0 +1,202 @@ +# Clawdapus Dash (`clawdash`) — Fleet Dashboard Design + +**Date:** 2026-02-28 +**Status:** IMPLEMENTED + +## Overview + +Clawdapus Dash (`clawdash`) is a standalone operator-facing container auto-injected into every Clawdapus pod. It provides fleet-level observability: a single pane of glass showing all agents, services, cllama proxies, surfaces, and their live health status. + +## Architecture + +### Container Model + +- **Service name:** `clawdash` +- **Image:** `ghcr.io/mostlydev/clawdash:latest` (Go binary, embedded HTML templates) +- **Port:** `:8082` (configurable via `CLAWDASH_ADDR`) +- **Hardening:** `read_only: true`, `tmpfs: [/tmp]`, `restart: on-failure` — same as all claw services +- **Network:** `claw-internal` +- **Labels:** `claw.pod: <pod-name>`, `claw.role: dashboard` + +### Data Sources + +Two complementary inputs: + +1. **`/claw/pod-manifest.json`** (read-only bind mount) — Generated by `compose_up.go` during the materialize pass. Contains the full resolved pod topology as a static snapshot: + - Per-service: name, clawType, imageRef, agent file, models, count + - Handles: platform, ID, username, guilds (with channels) + - Surfaces: scheme, target, accessMode, ports, channelConfig + - Skills: name list + - Invocations: schedule, message, to, name + - Cllama: proxy types, proxy service names + - Peer relationships: peerHandles map + +2. 
**`/var/run/docker.sock`** (read-only bind mount) — Live container status via Docker API: + - Container state: running, stopped, restarting + - Health check results: healthy, unhealthy, starting + - Uptime duration + - No exec, no lifecycle ops — strictly read-only + +### Injection into Compose + +Same pattern as cllama sidecar injection in `compose_emit.go`. A new `ClawdashConfig` struct passed to `EmitCompose`, which adds the `clawdash` service entry to `compose.generated.yml`. Always injected when the pod has any `x-claw` services (same `hasClaw` gate). + +### Manifest Generation + +During `claw up`, after Pass 1 (inspect + resolve all services) but before Pass 2 (materialize), `compose_up.go` serializes the resolved pod state into `pod-manifest.json` in the runtime dir. This is a JSON serialization of: + +```go +type PodManifest struct { + PodName string `json:"podName"` + Services map[string]ServiceManifest `json:"services"` + Proxies []ProxyManifest `json:"proxies"` +} + +type ServiceManifest struct { + ClawType string `json:"clawType"` + ImageRef string `json:"imageRef"` + Agent string `json:"agent"` + Models map[string]string `json:"models,omitempty"` + Count int `json:"count"` + Handles map[string]*driver.HandleInfo `json:"handles,omitempty"` + Surfaces []SurfaceManifest `json:"surfaces,omitempty"` + Skills []string `json:"skills,omitempty"` + Invocations []driver.Invocation `json:"invocations,omitempty"` + Cllama []string `json:"cllama,omitempty"` +} + +type SurfaceManifest struct { + Scheme string `json:"scheme"` + Target string `json:"target"` + AccessMode string `json:"accessMode,omitempty"` + Ports []string `json:"ports,omitempty"` +} + +type ProxyManifest struct { + ProxyType string `json:"proxyType"` + ServiceName string `json:"serviceName"` + Image string `json:"image"` +} +``` + +Skills are serialized as name-only (no host paths — those are meaningless inside the dashboard container). + +## Pages + +### 1. Fleet Overview (`/`) + +The home page. 
A card grid showing every service in the pod. + +**Card grouping** (section headers): +- **Agents** — services with a clawType (openclaw, nanoclaw, etc.) +- **Proxies** — cllama services +- **Infrastructure** — non-claw services (databases, caches, etc.) + +**Agent card contents:** +- Service name (bold) + claw type badge (small pill) +- Health status: green/amber/red dot + status text (healthy, unhealthy, starting, stopped) +- Model: primary model slot value (e.g. `anthropic/claude-sonnet-4-20250514`) +- Handles: platform icon(s) + username (e.g. Discord icon + `@fleet-bot`) +- Cllama indicator: proxy type badge if proxied (e.g. `passthrough`) +- Uptime: human-readable duration +- Count: ordinal badge if count > 1 (e.g. `x3`) + +**Proxy card contents:** +- Service name + `proxy` role badge +- Health dot + status +- Proxy type +- Uptime + +**Infrastructure card contents:** +- Service name + `native` badge +- Health dot (from Docker healthcheck if configured) +- Uptime + +**Click action:** Navigates to `/detail/:service` + +### 2. Topology (`/topology`) + +Layered column wiring diagram showing how pod components connect. + +**Five swim lanes (left to right):** + +``` +Channels │ Agents │ Proxies │ Services │ Volumes +──────────┼──────────┼───────────┼────────────┼───────── +discord ─── bot-a ─── cllama │ │ shared-data +discord ─── bot-b ─┘ ─── postgres │ + └── bot-c │ └ workspace +``` + +**Implementation:** +- HTML nodes positioned in CSS grid columns +- SVG `<line>` or `<path>` elements for connections, drawn between node edges +- Color-coded by surface scheme: + - Cyan (`--cyan`) for channel surfaces + - Amber (`--amber`) for service surfaces + - Green (`--green`) for volume/host surfaces + - Purple (`--purple`) for cllama proxy links +- Hover: highlighting a node dims all unconnected nodes/lines (CSS opacity transition) +- Health dots on each node (same as fleet cards) + +**Data flow:** Derived entirely from `pod-manifest.json` — surfaces define the edges, services define the nodes. 
Proxies appear in the middle column based on each agent's `cllama` field. Channels/services/volumes appear based on surface scheme. + +**No JS framework.** Pure HTML + inline SVG + CSS. Connection line coordinates computed server-side in Go template rendering (node positions are deterministic from the grid layout). + +### 3. Detail (`/detail/:service`) + +Drill-down view for a single service. Sections rendered as collapsible panels: + +| Section | Content | +|---------|---------| +| **Status** | Health dot, status text, uptime, container ID (short), image ref | +| **Surfaces** | Table: scheme, target, access mode, ports | +| **Handles** | Table: platform, username, ID, guilds (expandable with channels) | +| **Skills** | Bulleted list of skill names | +| **Invoke** | Table: name, cron schedule, message (truncated), target channel | +| **Models** | Table: slot, provider/model | +| **Cllama** | Proxy type(s), proxy service name, token status (present/absent — never shown) | + +## Visual Design + +Inherits cllama's design language for visual cohesion across the pod's operator tools: + +- **Fonts:** Geist Mono (monospace, labels/badges), Outfit (sans-serif, body text) +- **Color palette:** Same CSS variables — `--bg: #0c1017`, `--cyan: #22d3ee`, `--amber: #f0a500`, `--green: #34d399`, `--red: #ef4444`, `--purple: #a78bfa` +- **Scan-line overlay:** Same subtle `repeating-linear-gradient` texture +- **Top bar:** `CLAWDAPUS DASH` brand in Geist Mono uppercase, nav tabs (Fleet / Topology / Detail) +- **Cards:** `--bg-raised` background, `--line` border, subtle hover glow +- **Badges/pills:** Rounded, small, colored by type (cyan for claw types, amber for proxy, green for native) + +## Live Updates + +Smooth partial refresh without full-page reloads: + +- A small `