From 9f4a5468fb3adf2b84b1042b64fa30e983050ca4 Mon Sep 17 00:00:00 2001 From: Harshal Patil <12152047+harche@users.noreply.github.com> Date: Tue, 7 Oct 2025 11:01:37 -0400 Subject: [PATCH] Add nodes_debug_exec tool in pkg/ocp package --- README.md | 7 + pkg/config/config.go | 2 +- pkg/config/config_test.go | 4 +- pkg/kubernetes-mcp-server/cmd/root_test.go | 4 +- pkg/mcp/modules.go | 1 + pkg/mcp/nodes_test.go | 265 +++++++++++++ ...toolsets-full-tools-multicluster-enum.json | 52 +++ .../toolsets-full-tools-multicluster.json | 48 +++ .../toolsets-full-tools-openshift.json | 44 +++ pkg/mcp/testdata/toolsets-full-tools.json | 44 +++ .../toolsets-openshift-core-tools.json | 46 +++ pkg/ocp/nodes_debug.go | 307 +++++++++++++++ pkg/ocp/nodes_debug_test.go | 360 ++++++++++++++++++ pkg/ocp/ocp_client.go | 44 +++ pkg/ocp/testhelpers.go | 163 ++++++++ pkg/toolsets/openshift/nodes.go | 126 ++++++ pkg/toolsets/openshift/nodes_test.go | 95 +++++ pkg/toolsets/openshift/toolset.go | 31 ++ 18 files changed, 1638 insertions(+), 5 deletions(-) create mode 100644 pkg/mcp/nodes_test.go create mode 100644 pkg/mcp/testdata/toolsets-openshift-core-tools.json create mode 100644 pkg/ocp/nodes_debug.go create mode 100644 pkg/ocp/nodes_debug_test.go create mode 100644 pkg/ocp/ocp_client.go create mode 100644 pkg/ocp/testhelpers.go create mode 100644 pkg/toolsets/openshift/nodes.go create mode 100644 pkg/toolsets/openshift/nodes_test.go create mode 100644 pkg/toolsets/openshift/toolset.go diff --git a/README.md b/README.md index 600f4981..a95120cd 100644 --- a/README.md +++ b/README.md @@ -235,6 +235,13 @@ In case multi-cluster support is enabled (default) and you have access to multip - **projects_list** - List all the OpenShift projects in the current cluster +- **nodes_debug_exec** - Run commands on an OpenShift node using a privileged debug pod with comprehensive troubleshooting utilities. The debug pod uses the UBI9 toolbox image which includes: systemd tools (systemctl, journalctl), networking tools (ss, ip, ping, traceroute, nmap), process tools (ps, top, lsof, strace), file system tools (find, tar, rsync), and debugging tools (gdb). Commands execute in a chroot of the host filesystem, providing full access to node-level diagnostics. Output is truncated to the most recent 100 lines, so prefer filters like grep when expecting large logs. + - `command` (`array`) **(required)** - Command to execute on the node via chroot. All standard debugging utilities are available including systemctl, journalctl, ss, ip, ping, traceroute, nmap, ps, top, lsof, strace, find, tar, rsync, gdb, and more. Provide each argument as a separate array item (e.g. ['systemctl', 'status', 'kubelet'] or ['journalctl', '-u', 'kubelet', '--since', '1 hour ago']). + - `image` (`string`) - Container image to use for the debug pod (optional). Defaults to registry.access.redhat.com/ubi9/toolbox:latest which provides comprehensive debugging and troubleshooting utilities. + - `namespace` (`string`) - Namespace to create the temporary debug pod in (optional, defaults to the current namespace or 'default'). + - `node` (`string`) **(required)** - Name of the node to debug (e.g. worker-0). + - `timeout_seconds` (`integer`) - Maximum time to wait for the command to complete before timing out (optional, defaults to 300 seconds). + - **pods_list** - List all the Kubernetes pods in the current cluster from all namespaces - `labelSelector` (`string`) - Optional Kubernetes label selector (e.g. 'app=myapp,env=prod' or 'app in (myapp,yourapp)'), use this option when you want to filter the pods by label diff --git a/pkg/config/config.go b/pkg/config/config.go index 3fb2428e..3b033b90 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -66,7 +66,7 @@ type StaticConfig struct { func Default() *StaticConfig { return &StaticConfig{ ListOutput: "table", - Toolsets: []string{"core", "config", "helm"}, + Toolsets: []string{"core", "config", "helm", "openshift-core"}, } } diff --git a/pkg/config/config_test.go b/pkg/config/config_test.go index b498548d..2b046e8a 100644 --- a/pkg/config/config_test.go +++ b/pkg/config/config_test.go @@ -152,8 +152,8 @@ func (s *ConfigSuite) TestReadConfigValidPreservesDefaultsForMissingFields() { s.Equalf("table", config.ListOutput, "Expected ListOutput to be table, got %s", config.ListOutput) }) s.Run("toolsets defaulted correctly", func() { - s.Require().Lenf(config.Toolsets, 3, "Expected 3 toolsets, got %d", len(config.Toolsets)) - for _, toolset := range []string{"core", "config", "helm"} { + s.Require().Lenf(config.Toolsets, 4, "Expected 4 toolsets, got %d", len(config.Toolsets)) + for _, toolset := range []string{"core", "config", "helm", "openshift-core"} { s.Containsf(config.Toolsets, toolset, "Expected toolsets to contain %s", toolset) } }) diff --git a/pkg/kubernetes-mcp-server/cmd/root_test.go b/pkg/kubernetes-mcp-server/cmd/root_test.go index 22521667..e5342cd5 100644 --- a/pkg/kubernetes-mcp-server/cmd/root_test.go +++ b/pkg/kubernetes-mcp-server/cmd/root_test.go @@ -137,7 +137,7 @@ func TestToolsets(t *testing.T) { rootCmd := NewMCPServer(ioStreams) rootCmd.SetArgs([]string{"--help"}) o, err := captureOutput(rootCmd.Execute) // --help doesn't use logger/klog, cobra prints directly to stdout - if !strings.Contains(o, "Comma-separated list of MCP toolsets to use (available toolsets: config, core, helm).") { + if !strings.Contains(o, "Comma-separated list of MCP toolsets to use (available toolsets: config, core, helm, openshift-core).") { t.Fatalf("Expected all available toolsets, got %s %v", o, err) } }) @@ -145,7 +145,7 @@ func TestToolsets(t *testing.T) { ioStreams, out := testStream() rootCmd := NewMCPServer(ioStreams) rootCmd.SetArgs([]string{"--version", "--port=1337", "--log-level=1"}) - if err := rootCmd.Execute(); !strings.Contains(out.String(), "- Toolsets: core, config, helm") { + if err := rootCmd.Execute(); !strings.Contains(out.String(), "- Toolsets: core, config, helm, openshift-core") { t.Fatalf("Expected toolsets 'full', got %s %v", out, err) } }) diff --git a/pkg/mcp/modules.go b/pkg/mcp/modules.go index 3295d72b..b1391489 100644 --- a/pkg/mcp/modules.go +++ b/pkg/mcp/modules.go @@ -3,3 +3,4 @@ package mcp import _ "github.com/containers/kubernetes-mcp-server/pkg/toolsets/config" import _ "github.com/containers/kubernetes-mcp-server/pkg/toolsets/core" import _ "github.com/containers/kubernetes-mcp-server/pkg/toolsets/helm" +import _ "github.com/containers/kubernetes-mcp-server/pkg/toolsets/openshift" diff --git a/pkg/mcp/nodes_test.go b/pkg/mcp/nodes_test.go new file mode 100644 index 00000000..4efc83ff --- /dev/null +++ b/pkg/mcp/nodes_test.go @@ -0,0 +1,265 @@ +package mcp + +import ( + "encoding/json" + "io" + "net/http" + "strings" + "testing" + + "github.com/mark3labs/mcp-go/mcp" + v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/serializer" + + "github.com/containers/kubernetes-mcp-server/internal/test" +) + +func TestNodesDebugExecTool(t *testing.T) { + testCase(t, func(c *mcpContext) { + mockServer := test.NewMockServer() + defer mockServer.Close() + c.withKubeConfig(mockServer.Config()) + + var ( + createdPod v1.Pod + deleteCalled bool + ) + const namespace = "debug" + const logOutput = "filesystem repaired" + + scheme := runtime.NewScheme() + _ = v1.AddToScheme(scheme) + codec := serializer.NewCodecFactory(scheme).UniversalDeserializer() + + mockServer.Handle(http.HandlerFunc(func(w http.ResponseWriter, req *http.Request) { + switch { + case req.URL.Path == "/api": + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(`{"kind":"APIVersions","versions":["v1"],"serverAddressByClientCIDRs":[{"clientCIDR":"0.0.0.0/0"}]}`)) + case req.URL.Path == "/apis": + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(`{"kind":"APIGroupList","apiVersion":"v1","groups":[]}`)) + case req.URL.Path == "/api/v1": + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(`{"kind":"APIResourceList","apiVersion":"v1","resources":[{"name":"pods","singularName":"","namespaced":true,"kind":"Pod","verbs":["get","list","watch","create","update","patch","delete"]}]}`)) + case req.Method == http.MethodPatch && strings.HasPrefix(req.URL.Path, "/api/v1/namespaces/"+namespace+"/pods/"): + // Handle server-side apply (PATCH with fieldManager query param) + body, err := io.ReadAll(req.Body) + if err != nil { + t.Fatalf("failed to read apply body: %v", err) + } + created := &v1.Pod{} + if _, _, err = codec.Decode(body, nil, created); err != nil { + t.Fatalf("failed to decode apply body: %v", err) + } + createdPod = *created + // Keep the name from the request URL if it was provided + pathParts := strings.Split(req.URL.Path, "/") + if len(pathParts) > 0 { + createdPod.Name = pathParts[len(pathParts)-1] + } + createdPod.Namespace = namespace + w.Header().Set("Content-Type", "application/json") + _ = json.NewEncoder(w).Encode(&createdPod) + case req.Method == http.MethodPost && req.URL.Path == "/api/v1/namespaces/"+namespace+"/pods": + body, err := io.ReadAll(req.Body) + if err != nil { + t.Fatalf("failed to read create body: %v", err) + } + created := &v1.Pod{} + if _, _, err = codec.Decode(body, nil, created); err != nil { + t.Fatalf("failed to decode create body: %v", err) + } + createdPod = *created + createdPod.ObjectMeta = metav1.ObjectMeta{ + Namespace: namespace, + Name: createdPod.GenerateName + "abc", + } + w.Header().Set("Content-Type", "application/json") + _ = json.NewEncoder(w).Encode(&createdPod) + case req.Method == http.MethodGet && createdPod.Name != "" && req.URL.Path == "/api/v1/namespaces/"+namespace+"/pods/"+createdPod.Name: + podStatus := createdPod.DeepCopy() + podStatus.Status = v1.PodStatus{ + Phase: v1.PodSucceeded, + ContainerStatuses: []v1.ContainerStatus{{ + Name: "debug", + State: v1.ContainerState{Terminated: &v1.ContainerStateTerminated{ + ExitCode: 0, + }}, + }}, + } + w.Header().Set("Content-Type", "application/json") + _ = json.NewEncoder(w).Encode(podStatus) + case req.Method == http.MethodDelete && createdPod.Name != "" && req.URL.Path == "/api/v1/namespaces/"+namespace+"/pods/"+createdPod.Name: + deleteCalled = true + w.Header().Set("Content-Type", "application/json") + _ = json.NewEncoder(w).Encode(&metav1.Status{Status: "Success"}) + case req.Method == http.MethodGet && createdPod.Name != "" && req.URL.Path == "/api/v1/namespaces/"+namespace+"/pods/"+createdPod.Name+"/log": + w.Header().Set("Content-Type", "text/plain") + _, _ = w.Write([]byte(logOutput)) + } + })) + + toolResult, err := c.callTool("nodes_debug_exec", map[string]any{ + "node": "worker-0", + "namespace": namespace, + "command": []any{"uname", "-a"}, + }) + + t.Run("call succeeds", func(t *testing.T) { + if err != nil { + t.Fatalf("call tool failed: %v", err) + } + if toolResult.IsError { + t.Fatalf("tool returned error: %v", toolResult.Content) + } + if len(toolResult.Content) == 0 { + t.Fatalf("expected output content") + } + text := toolResult.Content[0].(mcp.TextContent).Text + if text != logOutput { + t.Fatalf("unexpected tool output %q", text) + } + }) + + t.Run("debug pod shaped correctly", func(t *testing.T) { + if createdPod.Spec.Containers == nil || len(createdPod.Spec.Containers) != 1 { + t.Fatalf("expected single container in debug pod") + } + container := createdPod.Spec.Containers[0] + expectedPrefix := []string{"chroot", "/host", "uname", "-a"} + if !equalStringSlices(container.Command, expectedPrefix) { + t.Fatalf("unexpected debug command: %v", container.Command) + } + if container.SecurityContext == nil || container.SecurityContext.Privileged == nil || !*container.SecurityContext.Privileged { + t.Fatalf("expected privileged container") + } + if len(createdPod.Spec.Volumes) == 0 || createdPod.Spec.Volumes[0].HostPath == nil { + t.Fatalf("expected hostPath volume on debug pod") + } + if !deleteCalled { + t.Fatalf("expected debug pod to be deleted") + } + }) + }) +} + +func equalStringSlices(a, b []string) bool { + if len(a) != len(b) { + return false + } + for i := range a { + if a[i] != b[i] { + return false + } + } + return true +} + +func TestNodesDebugExecToolNonZeroExit(t *testing.T) { + testCase(t, func(c *mcpContext) { + mockServer := test.NewMockServer() + defer mockServer.Close() + c.withKubeConfig(mockServer.Config()) + + const namespace = "default" + const errorMessage = "failed" + + scheme := runtime.NewScheme() + _ = v1.AddToScheme(scheme) + codec := serializer.NewCodecFactory(scheme).UniversalDeserializer() + + mockServer.Handle(http.HandlerFunc(func(w http.ResponseWriter, req *http.Request) { + switch { + case req.URL.Path == "/api": + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(`{"kind":"APIVersions","versions":["v1"],"serverAddressByClientCIDRs":[{"clientCIDR":"0.0.0.0/0"}]}`)) + case req.URL.Path == "/apis": + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(`{"kind":"APIGroupList","apiVersion":"v1","groups":[]}`)) + case req.URL.Path == "/api/v1": + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(`{"kind":"APIResourceList","apiVersion":"v1","resources":[{"name":"pods","singularName":"","namespaced":true,"kind":"Pod","verbs":["get","list","watch","create","update","patch","delete"]}]}`)) + case req.Method == http.MethodPatch && strings.HasPrefix(req.URL.Path, "/api/v1/namespaces/"+namespace+"/pods/"): + // Handle server-side apply (PATCH with fieldManager query param) + body, err := io.ReadAll(req.Body) + if err != nil { + t.Fatalf("failed to read apply body: %v", err) + } + pod := &v1.Pod{} + if _, _, err = codec.Decode(body, nil, pod); err != nil { + t.Fatalf("failed to decode apply body: %v", err) + } + // Keep the name from the request URL if it was provided + pathParts := strings.Split(req.URL.Path, "/") + if len(pathParts) > 0 { + pod.Name = pathParts[len(pathParts)-1] + } + pod.Namespace = namespace + w.Header().Set("Content-Type", "application/json") + _ = json.NewEncoder(w).Encode(pod) + case req.Method == http.MethodPost && req.URL.Path == "/api/v1/namespaces/"+namespace+"/pods": + body, err := io.ReadAll(req.Body) + if err != nil { + t.Fatalf("failed to read create body: %v", err) + } + pod := &v1.Pod{} + if _, _, err = codec.Decode(body, nil, pod); err != nil { + t.Fatalf("failed to decode create body: %v", err) + } + pod.ObjectMeta = metav1.ObjectMeta{Name: pod.GenerateName + "xyz", Namespace: namespace} + w.Header().Set("Content-Type", "application/json") + _ = json.NewEncoder(w).Encode(pod) + case strings.HasPrefix(req.URL.Path, "/api/v1/namespaces/"+namespace+"/pods/") && strings.HasSuffix(req.URL.Path, "/log"): + w.Header().Set("Content-Type", "text/plain") + _, _ = w.Write([]byte(errorMessage)) + case req.Method == http.MethodGet && strings.HasPrefix(req.URL.Path, "/api/v1/namespaces/"+namespace+"/pods/"): + pathParts := strings.Split(req.URL.Path, "/") + podName := pathParts[len(pathParts)-1] + pod := &v1.Pod{ + TypeMeta: metav1.TypeMeta{ + APIVersion: "v1", + Kind: "Pod", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: podName, + Namespace: namespace, + }, + } + pod.Status = v1.PodStatus{ + Phase: v1.PodSucceeded, + ContainerStatuses: []v1.ContainerStatus{{ + Name: "debug", + State: v1.ContainerState{Terminated: &v1.ContainerStateTerminated{ + ExitCode: 2, + Reason: "Error", + }}, + }}, + } + w.Header().Set("Content-Type", "application/json") + _ = json.NewEncoder(w).Encode(pod) + } + })) + + toolResult, err := c.callTool("nodes_debug_exec", map[string]any{ + "node": "infra-1", + "command": []any{"journalctl"}, + }) + + if err != nil { + t.Fatalf("call tool failed: %v", err) + } + if !toolResult.IsError { + t.Fatalf("expected tool to return error") + } + text := toolResult.Content[0].(mcp.TextContent).Text + if !strings.Contains(text, "command exited with code 2") { + t.Fatalf("expected exit code message, got %q", text) + } + if !strings.Contains(text, "Error") { + t.Fatalf("expected error reason included, got %q", text) + } + }) +} diff --git a/pkg/mcp/testdata/toolsets-full-tools-multicluster-enum.json b/pkg/mcp/testdata/toolsets-full-tools-multicluster-enum.json index 97af6fb5..4c5a8549 100644 --- a/pkg/mcp/testdata/toolsets-full-tools-multicluster-enum.json +++ b/pkg/mcp/testdata/toolsets-full-tools-multicluster-enum.json @@ -195,6 +195,58 @@ }, "name": "namespaces_list" }, + { + "annotations": { + "title": "Nodes: Debug Exec", + "readOnlyHint": false, + "destructiveHint": true, + "idempotentHint": false, + "openWorldHint": true + }, + "description": "Run commands on an OpenShift node using a privileged debug pod with comprehensive troubleshooting utilities. The debug pod uses the UBI9 toolbox image which includes: systemd tools (systemctl, journalctl), networking tools (ss, ip, ping, traceroute, nmap), process tools (ps, top, lsof, strace), file system tools (find, tar, rsync), and debugging tools (gdb). Commands execute in a chroot of the host filesystem, providing full access to node-level diagnostics. Output is truncated to the most recent 100 lines, so prefer filters like grep when expecting large logs.", + "inputSchema": { + "type": "object", + "properties": { + "node": { + "description": "Name of the node to debug (e.g. worker-0).", + "type": "string" + }, + "command": { + "description": "Command to execute on the node via chroot. All standard debugging utilities are available including systemctl, journalctl, ss, ip, ping, traceroute, nmap, ps, top, lsof, strace, find, tar, rsync, gdb, and more. Provide each argument as a separate array item (e.g. ['systemctl', 'status', 'kubelet'] or ['journalctl', '-u', 'kubelet', '--since', '1 hour ago']).", + "items": { + "type": "string" + }, + "type": "array" + }, + "context": { + "description": "Optional parameter selecting which context to run the tool in. Defaults to fake-context if not set", + "enum": [ + "extra-cluster", + "fake-context" + ], + "type": "string" + }, + "namespace": { + "description": "Namespace to create the temporary debug pod in (optional, defaults to the current namespace or 'default').", + "type": "string" + }, + "image": { + "description": "Container image to use for the debug pod (optional). Defaults to registry.access.redhat.com/ubi9/toolbox:latest which provides comprehensive debugging and troubleshooting utilities.", + "type": "string" + }, + "timeout_seconds": { + "description": "Maximum time to wait for the command to complete before timing out (optional, defaults to 300 seconds).", + "minimum": 1, + "type": "integer" + } + }, + "required": [ + "node", + "command" + ] + }, + "name": "nodes_debug_exec" + }, { "annotations": { "title": "Pods: Delete", diff --git a/pkg/mcp/testdata/toolsets-full-tools-multicluster.json b/pkg/mcp/testdata/toolsets-full-tools-multicluster.json index 861a1b5a..b1558e61 100644 --- a/pkg/mcp/testdata/toolsets-full-tools-multicluster.json +++ b/pkg/mcp/testdata/toolsets-full-tools-multicluster.json @@ -175,6 +175,54 @@ }, "name": "namespaces_list" }, + { + "annotations": { + "title": "Nodes: Debug Exec", + "readOnlyHint": false, + "destructiveHint": true, + "idempotentHint": false, + "openWorldHint": true + }, + "description": "Run commands on an OpenShift node using a privileged debug pod with comprehensive troubleshooting utilities. The debug pod uses the UBI9 toolbox image which includes: systemd tools (systemctl, journalctl), networking tools (ss, ip, ping, traceroute, nmap), process tools (ps, top, lsof, strace), file system tools (find, tar, rsync), and debugging tools (gdb). Commands execute in a chroot of the host filesystem, providing full access to node-level diagnostics. Output is truncated to the most recent 100 lines, so prefer filters like grep when expecting large logs.", + "inputSchema": { + "type": "object", + "properties": { + "node": { + "description": "Name of the node to debug (e.g. worker-0).", + "type": "string" + }, + "command": { + "description": "Command to execute on the node via chroot. All standard debugging utilities are available including systemctl, journalctl, ss, ip, ping, traceroute, nmap, ps, top, lsof, strace, find, tar, rsync, gdb, and more. Provide each argument as a separate array item (e.g. ['systemctl', 'status', 'kubelet'] or ['journalctl', '-u', 'kubelet', '--since', '1 hour ago']).", + "items": { + "type": "string" + }, + "type": "array" + }, + "context": { + "description": "Optional parameter selecting which context to run the tool in. Defaults to fake-context if not set", + "type": "string" + }, + "namespace": { + "description": "Namespace to create the temporary debug pod in (optional, defaults to the current namespace or 'default').", + "type": "string" + }, + "image": { + "description": "Container image to use for the debug pod (optional). Defaults to registry.access.redhat.com/ubi9/toolbox:latest which provides comprehensive debugging and troubleshooting utilities.", + "type": "string" + }, + "timeout_seconds": { + "description": "Maximum time to wait for the command to complete before timing out (optional, defaults to 300 seconds).", + "minimum": 1, + "type": "integer" + } + }, + "required": [ + "node", + "command" + ] + }, + "name": "nodes_debug_exec" + }, { "annotations": { "title": "Pods: Delete", diff --git a/pkg/mcp/testdata/toolsets-full-tools-openshift.json b/pkg/mcp/testdata/toolsets-full-tools-openshift.json index b5018945..f34771f2 100644 --- a/pkg/mcp/testdata/toolsets-full-tools-openshift.json +++ b/pkg/mcp/testdata/toolsets-full-tools-openshift.json @@ -139,6 +139,50 @@ }, "name": "namespaces_list" }, + { + "annotations": { + "title": "Nodes: Debug Exec", + "readOnlyHint": false, + "destructiveHint": true, + "idempotentHint": false, + "openWorldHint": true + }, + "description": "Run commands on an OpenShift node using a privileged debug pod with comprehensive troubleshooting utilities. The debug pod uses the UBI9 toolbox image which includes: systemd tools (systemctl, journalctl), networking tools (ss, ip, ping, traceroute, nmap), process tools (ps, top, lsof, strace), file system tools (find, tar, rsync), and debugging tools (gdb). Commands execute in a chroot of the host filesystem, providing full access to node-level diagnostics. Output is truncated to the most recent 100 lines, so prefer filters like grep when expecting large logs.", + "inputSchema": { + "type": "object", + "properties": { + "node": { + "description": "Name of the node to debug (e.g. worker-0).", + "type": "string" + }, + "command": { + "description": "Command to execute on the node via chroot. All standard debugging utilities are available including systemctl, journalctl, ss, ip, ping, traceroute, nmap, ps, top, lsof, strace, find, tar, rsync, gdb, and more. Provide each argument as a separate array item (e.g. ['systemctl', 'status', 'kubelet'] or ['journalctl', '-u', 'kubelet', '--since', '1 hour ago']).", + "items": { + "type": "string" + }, + "type": "array" + }, + "namespace": { + "description": "Namespace to create the temporary debug pod in (optional, defaults to the current namespace or 'default').", + "type": "string" + }, + "image": { + "description": "Container image to use for the debug pod (optional). Defaults to registry.access.redhat.com/ubi9/toolbox:latest which provides comprehensive debugging and troubleshooting utilities.", + "type": "string" + }, + "timeout_seconds": { + "description": "Maximum time to wait for the command to complete before timing out (optional, defaults to 300 seconds).", + "minimum": 1, + "type": "integer" + } + }, + "required": [ + "node", + "command" + ] + }, + "name": "nodes_debug_exec" + }, { "annotations": { "title": "Pods: Delete", diff --git a/pkg/mcp/testdata/toolsets-full-tools.json b/pkg/mcp/testdata/toolsets-full-tools.json index 7b9f471d..4e8ca6dc 100644 --- a/pkg/mcp/testdata/toolsets-full-tools.json +++ b/pkg/mcp/testdata/toolsets-full-tools.json @@ -139,6 +139,50 @@ }, "name": "namespaces_list" }, + { + "annotations": { + "title": "Nodes: Debug Exec", + "readOnlyHint": false, + "destructiveHint": true, + "idempotentHint": false, + "openWorldHint": true + }, + "description": "Run commands on an OpenShift node using a privileged debug pod with comprehensive troubleshooting utilities. The debug pod uses the UBI9 toolbox image which includes: systemd tools (systemctl, journalctl), networking tools (ss, ip, ping, traceroute, nmap), process tools (ps, top, lsof, strace), file system tools (find, tar, rsync), and debugging tools (gdb). Commands execute in a chroot of the host filesystem, providing full access to node-level diagnostics. Output is truncated to the most recent 100 lines, so prefer filters like grep when expecting large logs.", + "inputSchema": { + "type": "object", + "properties": { + "node": { + "description": "Name of the node to debug (e.g. worker-0).", + "type": "string" + }, + "command": { + "description": "Command to execute on the node via chroot. All standard debugging utilities are available including systemctl, journalctl, ss, ip, ping, traceroute, nmap, ps, top, lsof, strace, find, tar, rsync, gdb, and more. Provide each argument as a separate array item (e.g. ['systemctl', 'status', 'kubelet'] or ['journalctl', '-u', 'kubelet', '--since', '1 hour ago']).", + "items": { + "type": "string" + }, + "type": "array" + }, + "namespace": { + "description": "Namespace to create the temporary debug pod in (optional, defaults to the current namespace or 'default').", + "type": "string" + }, + "image": { + "description": "Container image to use for the debug pod (optional). Defaults to registry.access.redhat.com/ubi9/toolbox:latest which provides comprehensive debugging and troubleshooting utilities.", + "type": "string" + }, + "timeout_seconds": { + "description": "Maximum time to wait for the command to complete before timing out (optional, defaults to 300 seconds).", + "minimum": 1, + "type": "integer" + } + }, + "required": [ + "node", + "command" + ] + }, + "name": "nodes_debug_exec" + }, { "annotations": { "title": "Pods: Delete", diff --git a/pkg/mcp/testdata/toolsets-openshift-core-tools.json b/pkg/mcp/testdata/toolsets-openshift-core-tools.json new file mode 100644 index 00000000..977563d3 --- /dev/null +++ b/pkg/mcp/testdata/toolsets-openshift-core-tools.json @@ -0,0 +1,46 @@ +[ + { + "annotations": { + "title": "Nodes: Debug Exec", + "readOnlyHint": false, + "destructiveHint": true, + "idempotentHint": false, + "openWorldHint": true + }, + "description": "Run commands on an OpenShift node using a privileged debug pod with comprehensive troubleshooting utilities. The debug pod uses the UBI9 toolbox image which includes: systemd tools (systemctl, journalctl), networking tools (ss, ip, ping, traceroute, nmap), process tools (ps, top, lsof, strace), file system tools (find, tar, rsync), and debugging tools (gdb). Commands execute in a chroot of the host filesystem, providing full access to node-level diagnostics. Output is truncated to the most recent 100 lines, so prefer filters like grep when expecting large logs.", + "inputSchema": { + "type": "object", + "properties": { + "node": { + "description": "Name of the node to debug (e.g. worker-0).", + "type": "string" + }, + "command": { + "description": "Command to execute on the node via chroot. All standard debugging utilities are available including systemctl, journalctl, ss, ip, ping, traceroute, nmap, ps, top, lsof, strace, find, tar, rsync, gdb, and more. Provide each argument as a separate array item (e.g. ['systemctl', 'status', 'kubelet'] or ['journalctl', '-u', 'kubelet', '--since', '1 hour ago']).", + "items": { + "type": "string" + }, + "type": "array" + }, + "namespace": { + "description": "Namespace to create the temporary debug pod in (optional, defaults to the current namespace or 'default').", + "type": "string" + }, + "image": { + "description": "Container image to use for the debug pod (optional). Defaults to registry.access.redhat.com/ubi9/toolbox:latest which provides comprehensive debugging and troubleshooting utilities.", + "type": "string" + }, + "timeout_seconds": { + "description": "Maximum time to wait for the command to complete before timing out (optional, defaults to 300 seconds).", + "minimum": 1, + "type": "integer" + } + }, + "required": [ + "node", + "command" + ] + }, + "name": "nodes_debug_exec" + } +] diff --git a/pkg/ocp/nodes_debug.go b/pkg/ocp/nodes_debug.go new file mode 100644 index 00000000..ee2e5293 --- /dev/null +++ b/pkg/ocp/nodes_debug.go @@ -0,0 +1,307 @@ +package ocp + +import ( + "context" + "errors" + "fmt" + "strings" + "time" + + "github.com/containers/kubernetes-mcp-server/pkg/kubernetes" + "github.com/containers/kubernetes-mcp-server/pkg/version" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/utils/ptr" +) + +const ( + // DefaultNodeDebugImage is the UBI9 toolbox image that provides comprehensive debugging and troubleshooting utilities. + // This image includes: systemd tools (systemctl, journalctl), networking tools (ss, ip, ping, traceroute, nmap), + // process tools (ps, top, lsof, strace), file system tools (find, tar, rsync), debugging tools (gdb), + // and many other utilities commonly needed for node-level debugging and diagnostics. + DefaultNodeDebugImage = "registry.access.redhat.com/ubi9/toolbox:latest" + // NodeDebugContainerName is the name used for the debug container, matching oc debug defaults. + NodeDebugContainerName = "debug" + // DefaultNodeDebugTimeout is the maximum time to wait for the debug pod to finish executing. + DefaultNodeDebugTimeout = 1 * time.Minute +) + +// NodesDebugExec mimics `oc debug node/ -- ` by creating a privileged pod on the target +// node, running the provided command within a chroot of the host filesystem, collecting its output, and +// removing the pod afterwards. +// +// When namespace is empty, the configured namespace (or "default" if none) is used. When image is empty the +// default debug image is used. Timeout controls how long we wait for the pod to complete. +func NodesDebugExec( + ctx context.Context, + k OpenshiftClient, + namespace string, + nodeName string, + image string, + command []string, + timeout time.Duration, +) (string, error) { + if nodeName == "" { + return "", errors.New("node name is required") + } + if len(command) == 0 { + return "", errors.New("command is required") + } + + ns := k.NamespaceOrDefault(namespace) + if ns == "" { + ns = "default" + } + debugImage := image + if debugImage == "" { + debugImage = DefaultNodeDebugImage + } + if timeout <= 0 { + timeout = DefaultNodeDebugTimeout + } + + // Create the debug pod + created, err := createDebugPod(ctx, k, nodeName, ns, debugImage, command) + if err != nil { + return "", err + } + + // Ensure the pod is deleted regardless of completion state. + defer func() { + deleteCtx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() + pods, err := k.AccessControlClientset().Pods(ns) + if err == nil { + _ = pods.Delete(deleteCtx, created.Name, metav1.DeleteOptions{}) + } + }() + + // Poll for debug pod completion + terminated, lastPod, waitMsg, err := pollForCompletion(ctx, k, ns, created.Name, timeout) + if err != nil { + return "", err + } + + // Retrieve the logs + logs, err := retrieveLogs(ctx, k, ns, created.Name) + if err != nil { + return "", err + } + + // Process the results + return processResults(terminated, lastPod, waitMsg, logs) +} + +// createDebugPod creates a privileged pod on the target node to run debug commands. +func createDebugPod( + ctx context.Context, + k OpenshiftClient, + nodeName string, + namespace string, + image string, + command []string, +) (*corev1.Pod, error) { + sanitizedNode := sanitizeForName(nodeName) + hostPathType := corev1.HostPathDirectory + + // Generate a unique name + podName := fmt.Sprintf("node-debug-%s-%d", sanitizedNode, time.Now().UnixNano()) + + debugPod := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: podName, + Namespace: namespace, + Labels: map[string]string{ + kubernetes.AppKubernetesManagedBy: version.BinaryName, + kubernetes.AppKubernetesComponent: "node-debug", + kubernetes.AppKubernetesName: fmt.Sprintf("node-debug-%s", sanitizedNode), + }, + }, + Spec: corev1.PodSpec{ + AutomountServiceAccountToken: ptr.To(false), + NodeName: nodeName, + RestartPolicy: corev1.RestartPolicyNever, + SecurityContext: &corev1.PodSecurityContext{ + RunAsUser: ptr.To[int64](0), + }, + Tolerations: []corev1.Toleration{ + {Operator: corev1.TolerationOpExists}, + {Operator: corev1.TolerationOpExists, Effect: corev1.TaintEffectNoSchedule}, + {Operator: corev1.TolerationOpExists, Effect: corev1.TaintEffectNoExecute}, + }, + Volumes: []corev1.Volume{ + { + Name: "host-root", + VolumeSource: corev1.VolumeSource{ + HostPath: &corev1.HostPathVolumeSource{ + Path: "/", + Type: &hostPathType, + }, + }, + }, + }, + Containers: []corev1.Container{ + { + Name: NodeDebugContainerName, + Image: image, + ImagePullPolicy: corev1.PullIfNotPresent, + Command: append([]string{"chroot", "/host"}, command...), + SecurityContext: &corev1.SecurityContext{ + Privileged: ptr.To(true), + RunAsUser: ptr.To[int64](0), + }, + VolumeMounts: []corev1.VolumeMount{ + {Name: "host-root", MountPath: "/host"}, + }, + }, + }, + }, + } + + // Create the pod using AccessControlClientset + pods, err := k.AccessControlClientset().Pods(namespace) + if err != nil { + return nil, fmt.Errorf("failed to get pods interface: %w", err) + } + + created, err := pods.Create(ctx, debugPod, metav1.CreateOptions{}) + if err != nil { + return nil, fmt.Errorf("failed to create debug pod: %w", err) + } + + return created, nil +} + +// pollForCompletion polls the debug pod until it completes or times out. +func pollForCompletion( + ctx context.Context, + k OpenshiftClient, + namespace string, + podName string, + timeout time.Duration, +) (*corev1.ContainerStateTerminated, *corev1.Pod, string, error) { + pollCtx, cancel := context.WithTimeout(ctx, timeout) + defer cancel() + + ticker := time.NewTicker(2 * time.Second) + defer ticker.Stop() + + var ( + lastPod *corev1.Pod + terminated *corev1.ContainerStateTerminated + waitMsg string + ) + + for { + select { + case <-pollCtx.Done(): + return nil, nil, "", fmt.Errorf("timed out waiting for debug pod %s to complete: %w", podName, pollCtx.Err()) + default: + } + + // Get pod status using AccessControlClientset + pods, getErr := k.AccessControlClientset().Pods(namespace) + if getErr != nil { + return nil, nil, "", fmt.Errorf("failed to get pods interface: %w", getErr) + } + + current, err := pods.Get(pollCtx, podName, metav1.GetOptions{}) + if err != nil { + return nil, nil, "", fmt.Errorf("failed to get debug pod status: %w", err) + } + lastPod = current + + if status := containerStatusByName(current.Status.ContainerStatuses, NodeDebugContainerName); status != nil { + if status.State.Waiting != nil { + waitMsg = fmt.Sprintf("container waiting: %s", status.State.Waiting.Reason) + // Image pull issues should fail fast. + if status.State.Waiting.Reason == "ErrImagePull" || status.State.Waiting.Reason == "ImagePullBackOff" { + return nil, nil, "", fmt.Errorf("debug container failed to start (%s): %s", status.State.Waiting.Reason, status.State.Waiting.Message) + } + } + if status.State.Terminated != nil { + terminated = status.State.Terminated + break + } + } + + if current.Status.Phase == corev1.PodFailed { + break + } + + select { + case <-pollCtx.Done(): + return nil, nil, "", fmt.Errorf("timed out waiting for debug pod %s to complete: %w", podName, pollCtx.Err()) + case <-ticker.C: + } + } + + return terminated, lastPod, waitMsg, nil +} + +// retrieveLogs retrieves the logs from the debug pod. +func retrieveLogs(ctx context.Context, k OpenshiftClient, namespace, podName string) (string, error) { + logCtx, logCancel := context.WithTimeout(ctx, 30*time.Second) + defer logCancel() + logs, logErr := k.PodsLog(logCtx, namespace, podName, NodeDebugContainerName, false, 0) + if logErr != nil { + return "", fmt.Errorf("failed to retrieve debug pod logs: %w", logErr) + } + return strings.TrimSpace(logs), nil +} + +// processResults processes the debug pod completion status and returns the appropriate result. +func processResults(terminated *corev1.ContainerStateTerminated, lastPod *corev1.Pod, waitMsg, logs string) (string, error) { + if terminated != nil { + if terminated.ExitCode != 0 { + errMsg := fmt.Sprintf("command exited with code %d", terminated.ExitCode) + if terminated.Reason != "" { + errMsg = fmt.Sprintf("%s (%s)", errMsg, terminated.Reason) + } + if terminated.Message != "" { + errMsg = fmt.Sprintf("%s: %s", errMsg, terminated.Message) + } + return logs, errors.New(errMsg) + } + return logs, nil + } + + if lastPod != nil && lastPod.Status.Reason != "" { + return logs, fmt.Errorf("debug pod failed: %s", lastPod.Status.Reason) + } + if waitMsg != "" { + return logs, fmt.Errorf("debug container did not complete: %s", waitMsg) + } + return logs, errors.New("debug container did not reach a terminal state") +} + +func sanitizeForName(name string) string { + lower := strings.ToLower(name) + var b strings.Builder + b.Grow(len(lower)) + for _, r := range lower { + if (r >= 'a' && r <= 'z') || (r >= '0' && r <= '9') || r == '-' { + b.WriteRune(r) + continue + } + b.WriteRune('-') + } + sanitized := strings.Trim(b.String(), "-") + if sanitized == "" { + sanitized = "node" + } + if len(sanitized) > 40 { + sanitized = sanitized[:40] + } + return sanitized +} + +func containerStatusByName(statuses []corev1.ContainerStatus, name string) *corev1.ContainerStatus { + for idx := range statuses { + if statuses[idx].Name == name { + return &statuses[idx] + } + } + return nil +} diff --git a/pkg/ocp/nodes_debug_test.go b/pkg/ocp/nodes_debug_test.go new file mode 100644 index 00000000..2cee6b65 --- /dev/null +++ b/pkg/ocp/nodes_debug_test.go @@ -0,0 +1,360 @@ +package ocp + +import ( + "context" + "fmt" + "strings" + "testing" + "time" + + corev1 "k8s.io/api/core/v1" +) + +func TestNodesDebugExecCreatesPrivilegedChrootPod(t *testing.T) { + env := NewNodeDebugTestEnv(t) + env.Pods.Logs = "kernel 6.8" + + out, err := NodesDebugExec(context.Background(), env.Kubernetes, "", "worker-0", "", []string{"uname", "-a"}, 2*time.Minute) + if err != nil { + t.Fatalf("NodesDebugExec returned error: %v", err) + } + if out != "kernel 6.8" { + t.Fatalf("unexpected command output: %q", out) + } + + created := env.Pods.Created + if created == nil { + t.Fatalf("expected debug pod to be created") + } + if created.Namespace != "default" { + t.Fatalf("expected default namespace fallback, got %q", created.Namespace) + } + if created.Spec.NodeName != "worker-0" { + t.Fatalf("expected pod to target node worker-0, got %q", created.Spec.NodeName) + } + if !env.Pods.Deleted { + t.Fatalf("expected debug pod to be deleted after execution") + } + + if len(created.Spec.Containers) != 1 { + t.Fatalf("expected single container in debug pod") + } + container := created.Spec.Containers[0] + if container.Image != DefaultNodeDebugImage { + t.Fatalf("expected default image %q, got %q", DefaultNodeDebugImage, container.Image) + } + expectedCommand := []string{"chroot", "/host", "uname", "-a"} + if len(container.Command) != len(expectedCommand) { + t.Fatalf("unexpected command length, got %v", container.Command) + } + for i, part := range expectedCommand { + if container.Command[i] != part { + t.Fatalf("command[%d] = %q, expected %q", i, container.Command[i], part) + } + } + if container.SecurityContext == nil || container.SecurityContext.Privileged == nil || !*container.SecurityContext.Privileged { + t.Fatalf("expected container to run privileged") + } + if len(container.VolumeMounts) != 1 || container.VolumeMounts[0].MountPath != "/host" { + t.Fatalf("expected container to mount host root at /host") + } + + if created.Spec.SecurityContext == nil || created.Spec.SecurityContext.RunAsUser == nil || *created.Spec.SecurityContext.RunAsUser != 0 { + t.Fatalf("expected pod security context to run as root") + } + + if len(created.Spec.Volumes) != 1 || created.Spec.Volumes[0].HostPath == nil { + t.Fatalf("expected host root volume to be configured") + } +} + +func TestNodesDebugExecReturnsErrorForNonZeroExit(t *testing.T) { + env := NewNodeDebugTestEnv(t) + env.Pods.ExitCode = 5 + env.Pods.TerminatedReason = "Error" + env.Pods.TerminatedMessage = "some failure" + env.Pods.Logs = "bad things happened" + + out, err := NodesDebugExec(context.Background(), env.Kubernetes, "debug-ns", "infra-node", "registry.example/custom:latest", []string{"journalctl", "-xe"}, time.Minute) + if err == nil { + t.Fatalf("expected error for non-zero exit code") + } + if out != "bad things happened" { + t.Fatalf("expected logs to be returned alongside error, got %q", out) + } + + created := env.Pods.Created + if created == nil { + t.Fatalf("expected pod to be created") + } + if created.Namespace != "debug-ns" { + t.Fatalf("expected provided namespace to be used, got %q", created.Namespace) + } + if containerImage := created.Spec.Containers[0].Image; containerImage != "registry.example/custom:latest" { + t.Fatalf("expected custom image to be used, got %q", containerImage) + } +} + +func TestCreateDebugPod(t *testing.T) { + env := NewNodeDebugTestEnv(t) + + created, err := createDebugPod(context.Background(), env.Kubernetes, "worker-1", "test-ns", "custom:v1", []string{"ls", "-la"}) + if err != nil { + t.Fatalf("createDebugPod failed: %v", err) + } + if created == nil { + t.Fatalf("expected pod to be created") + } + if created.Namespace != "test-ns" { + t.Fatalf("expected namespace test-ns, got %q", created.Namespace) + } + if created.Spec.NodeName != "worker-1" { + t.Fatalf("expected node worker-1, got %q", created.Spec.NodeName) + } + if !strings.HasPrefix(created.Name, "node-debug-worker-1-") { + t.Fatalf("unexpected pod name: %q", created.Name) + } + if len(created.Spec.Containers) != 1 { + t.Fatalf("expected 1 container, got %d", len(created.Spec.Containers)) + } + container := created.Spec.Containers[0] + if container.Image != "custom:v1" { + t.Fatalf("expected image custom:v1, got %q", container.Image) + } + expectedCmd := []string{"chroot", "/host", "ls", "-la"} + if len(container.Command) != len(expectedCmd) { + t.Fatalf("expected %d command parts, got %d", len(expectedCmd), len(container.Command)) + } + for i, part := range expectedCmd { + if container.Command[i] != part { + t.Fatalf("command[%d] = %q, expected %q", i, container.Command[i], part) + } + } + if container.SecurityContext == nil || !*container.SecurityContext.Privileged { + t.Fatalf("expected privileged container") + } +} + +func TestPollForCompletion(t *testing.T) { + tests := []struct { + name string + exitCode int32 + terminatedReason string + waitingReason string + waitingMessage string + expectError bool + expectTerminated bool + errorContains []string + expectedExitCode int32 + expectedReason string + }{ + { + name: "successful completion", + exitCode: 0, + expectTerminated: true, + expectedExitCode: 0, + }, + { + name: "non-zero exit code", + exitCode: 42, + terminatedReason: "Error", + expectTerminated: true, + expectedExitCode: 42, + expectedReason: "Error", + }, + { + name: "image pull error", + waitingReason: "ErrImagePull", + waitingMessage: "image not found", + expectError: true, + errorContains: []string{"ErrImagePull", "image not found"}, + }, + { + name: "image pull backoff", + waitingReason: "ImagePullBackOff", + waitingMessage: "back-off pulling image", + expectError: true, + errorContains: []string{"ImagePullBackOff", "back-off pulling image"}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + env := NewNodeDebugTestEnv(t) + env.Pods.ExitCode = tt.exitCode + env.Pods.TerminatedReason = tt.terminatedReason + env.Pods.WaitingReason = tt.waitingReason + env.Pods.WaitingMessage = tt.waitingMessage + + created, _ := createDebugPod(context.Background(), env.Kubernetes, "node-1", "default", DefaultNodeDebugImage, []string{"echo", "test"}) + + terminated, lastPod, waitMsg, err := pollForCompletion(context.Background(), env.Kubernetes, "default", created.Name, time.Minute) + + if tt.expectError { + if err == nil { + t.Fatalf("expected error but got none") + } + for _, substr := range tt.errorContains { + if !strings.Contains(err.Error(), substr) { + t.Fatalf("expected error to contain %q, got: %v", substr, err) + } + } + return + } + + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + if tt.expectTerminated { + if terminated == nil { + t.Fatalf("expected terminated state") + } + if terminated.ExitCode != tt.expectedExitCode { + t.Fatalf("expected exit code %d, got %d", tt.expectedExitCode, terminated.ExitCode) + } + if tt.expectedReason != "" && terminated.Reason != tt.expectedReason { + t.Fatalf("expected reason %q, got %q", tt.expectedReason, terminated.Reason) + } + if lastPod == nil { + t.Fatalf("expected lastPod to be set") + } + } + + if tt.waitingReason == "" && waitMsg != "" { + t.Fatalf("expected no wait message, got %q", waitMsg) + } + }) + } +} + +func TestRetrieveLogs(t *testing.T) { + env := NewNodeDebugTestEnv(t) + env.Pods.Logs = " some output with whitespace \n" + + created, _ := createDebugPod(context.Background(), env.Kubernetes, "node-1", "default", DefaultNodeDebugImage, []string{"echo", "test"}) + + logs, err := retrieveLogs(context.Background(), env.Kubernetes, "default", created.Name) + if err != nil { + t.Fatalf("retrieveLogs failed: %v", err) + } + if logs != "some output with whitespace" { + t.Fatalf("expected trimmed logs, got %q", logs) + } +} + +func TestProcessResults(t *testing.T) { + tests := []struct { + name string + terminated *corev1.ContainerStateTerminated + pod *corev1.Pod + waitMsg string + logs string + expectError bool + errorContains []string + }{ + { + name: "successful completion", + terminated: &corev1.ContainerStateTerminated{ + ExitCode: 0, + }, + logs: "success output", + expectError: false, + }, + { + name: "non-zero exit code", + terminated: &corev1.ContainerStateTerminated{ + ExitCode: 127, + Reason: "CommandNotFound", + Message: "command not found", + }, + logs: "error logs", + expectError: true, + errorContains: []string{"127", "CommandNotFound", "command not found"}, + }, + { + name: "non-zero exit code without reason or message", + terminated: &corev1.ContainerStateTerminated{ + ExitCode: 1, + }, + logs: "failed", + expectError: true, + errorContains: []string{"command exited with code 1"}, + }, + { + name: "pod failed", + pod: &corev1.Pod{ + Status: corev1.PodStatus{ + Reason: "Evicted", + }, + }, + logs: "pod evicted", + expectError: true, + errorContains: []string{"Evicted"}, + }, + { + name: "container waiting", + waitMsg: "container waiting: ImagePullBackOff", + logs: "waiting logs", + expectError: true, + errorContains: []string{"did not complete"}, + }, + { + name: "no terminal state", + logs: "incomplete", + expectError: true, + errorContains: []string{"did not reach a terminal state"}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result, err := processResults(tt.terminated, tt.pod, tt.waitMsg, tt.logs) + + if tt.expectError { + if err == nil { + t.Fatalf("expected error but got none") + } + for _, substr := range tt.errorContains { + if !strings.Contains(err.Error(), substr) { + t.Fatalf("expected error to contain %q, got: %v", substr, err) + } + } + } else { + if err != nil { + t.Fatalf("expected no error, got: %v", err) + } + } + + if result != tt.logs { + t.Fatalf("expected result %q, got %q", tt.logs, result) + } + }) + } +} + +func TestSanitizeForName(t *testing.T) { + tests := []struct { + input string + expected string + }{ + {"worker-0", "worker-0"}, + {"WORKER-0", "worker-0"}, + {"worker.0", "worker-0"}, + {"worker_0", "worker-0"}, + {"ip-10-0-1-42.ec2.internal", "ip-10-0-1-42-ec2-internal"}, + {"", "node"}, + {"---", "node"}, + {strings.Repeat("a", 50), strings.Repeat("a", 40)}, + {"Worker-Node_123.domain", "worker-node-123-domain"}, + } + + for _, tt := range tests { + t.Run(fmt.Sprintf("sanitize(%q)", tt.input), func(t *testing.T) { + result := sanitizeForName(tt.input) + if result != tt.expected { + t.Fatalf("expected %q, got %q", tt.expected, result) + } + }) + } +} diff --git a/pkg/ocp/ocp_client.go b/pkg/ocp/ocp_client.go new file mode 100644 index 00000000..5c2766b4 --- /dev/null +++ b/pkg/ocp/ocp_client.go @@ -0,0 +1,44 @@ +package ocp + +import ( + "context" + + "github.com/containers/kubernetes-mcp-server/pkg/kubernetes" + + corev1client "k8s.io/client-go/kubernetes/typed/core/v1" +) + +// ClientsetInterface defines the interface for access-controlled clientset operations. +// This allows code to work with kubernetes.AccessControlClientset through an interface, +// making it easier to test and decouple from the concrete implementation. +type ClientsetInterface interface { + Pods(namespace string) (corev1client.PodInterface, error) +} + +// OpenshiftClient defines a minimal interface for kubernetes operations commonly needed +// by OCP toolsets. This allows for easier testing and decoupling from the concrete +// kubernetes.Kubernetes type. +type OpenshiftClient interface { + NamespaceOrDefault(namespace string) string + AccessControlClientset() ClientsetInterface + PodsLog(ctx context.Context, namespace, name, container string, previous bool, tail int64) (string, error) +} + +// OpenshiftClientAdapter adapts kubernetes.Kubernetes to implement OpenshiftClient. +// This allows production code to use the concrete *kubernetes.Kubernetes type +// while tests can use a mock implementation. +type OpenshiftClientAdapter struct { + *kubernetes.Kubernetes +} + +// NewOpenshiftClient creates a new adapter that wraps kubernetes.Kubernetes +// to implement the OpenshiftClient interface. +func NewOpenshiftClient(k *kubernetes.Kubernetes) *OpenshiftClientAdapter { + return &OpenshiftClientAdapter{Kubernetes: k} +} + +// AccessControlClientset returns the access control clientset as an interface. +// This satisfies the OpenshiftClient interface. +func (c *OpenshiftClientAdapter) AccessControlClientset() ClientsetInterface { + return c.Kubernetes.AccessControlClientset() +} diff --git a/pkg/ocp/testhelpers.go b/pkg/ocp/testhelpers.go new file mode 100644 index 00000000..e35d1d9f --- /dev/null +++ b/pkg/ocp/testhelpers.go @@ -0,0 +1,163 @@ +package ocp + +import ( + "context" + "fmt" + "io" + "net/http" + "net/url" + "strings" + "testing" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" + schemek8s "k8s.io/client-go/kubernetes/scheme" + corev1client "k8s.io/client-go/kubernetes/typed/core/v1" + restclient "k8s.io/client-go/rest" +) + +// NodeDebugTestEnv bundles a test Kubernetes client with a controllable pods client for tests. +type NodeDebugTestEnv struct { + Kubernetes *FakeKubernetesClient + Pods *FakePodInterface +} + +// NewNodeDebugTestEnv constructs a testing harness for exercising NodesDebugExec. +func NewNodeDebugTestEnv(t *testing.T) *NodeDebugTestEnv { + t.Helper() + + podsClient := &FakePodInterface{} + fakeK8s := &FakeKubernetesClient{ + pods: podsClient, + namespace: "default", + } + + return &NodeDebugTestEnv{ + Kubernetes: fakeK8s, + Pods: podsClient, + } +} + +// FakeKubernetesClient implements the OpenshiftClient interface for testing +type FakeKubernetesClient struct { + pods *FakePodInterface + namespace string +} + +// AccessControlClientset returns a fake clientset for testing +func (f *FakeKubernetesClient) AccessControlClientset() ClientsetInterface { + return &FakeAccessControlClientset{pods: f.pods} +} + +func (f *FakeKubernetesClient) NamespaceOrDefault(namespace string) string { + if namespace == "" { + return f.namespace + } + return namespace +} + +func (f *FakeKubernetesClient) PodsLog(ctx context.Context, namespace, name, container string, previous bool, tail int64) (string, error) { + req := f.pods.GetLogs(name, &corev1.PodLogOptions{Container: container, Previous: previous}) + res := req.Do(ctx) + if res.Error() != nil { + return "", res.Error() + } + rawData, err := res.Raw() + if err != nil { + return "", err + } + return string(rawData), nil +} + +// FakeAccessControlClientset mimics kubernetes.AccessControlClientset for testing +type FakeAccessControlClientset struct { + pods *FakePodInterface +} + +func (f *FakeAccessControlClientset) Pods(namespace string) (corev1client.PodInterface, error) { + return f.pods, nil +} + +// FakePodInterface implements corev1client.PodInterface with deterministic behaviour for tests. +type FakePodInterface struct { + corev1client.PodInterface + Created *corev1.Pod + Deleted bool + ExitCode int32 + TerminatedReason string + TerminatedMessage string + WaitingReason string + WaitingMessage string + Logs string +} + +func (f *FakePodInterface) Create(ctx context.Context, pod *corev1.Pod, opts metav1.CreateOptions) (*corev1.Pod, error) { + copy := pod.DeepCopy() + if copy.Name == "" && copy.GenerateName != "" { + copy.Name = copy.GenerateName + "test" + } + f.Created = copy + return copy.DeepCopy(), nil +} + +func (f *FakePodInterface) Get(ctx context.Context, name string, opts metav1.GetOptions) (*corev1.Pod, error) { + if f.Created == nil { + return nil, fmt.Errorf("pod not created yet") + } + pod := f.Created.DeepCopy() + + // If waiting state is set, return that instead of terminated + if f.WaitingReason != "" { + waiting := &corev1.ContainerStateWaiting{Reason: f.WaitingReason} + if f.WaitingMessage != "" { + waiting.Message = f.WaitingMessage + } + pod.Status.ContainerStatuses = []corev1.ContainerStatus{{ + Name: NodeDebugContainerName, + State: corev1.ContainerState{Waiting: waiting}, + }} + pod.Status.Phase = corev1.PodPending + return pod, nil + } + + // Otherwise return terminated state + terminated := &corev1.ContainerStateTerminated{ExitCode: f.ExitCode} + if f.TerminatedReason != "" { + terminated.Reason = f.TerminatedReason + } + if f.TerminatedMessage != "" { + terminated.Message = f.TerminatedMessage + } + pod.Status.ContainerStatuses = []corev1.ContainerStatus{{ + Name: NodeDebugContainerName, + State: corev1.ContainerState{Terminated: terminated}, + }} + pod.Status.Phase = corev1.PodSucceeded + return pod, nil +} + +func (f *FakePodInterface) Delete(ctx context.Context, name string, opts metav1.DeleteOptions) error { + f.Deleted = true + return nil +} + +func (f *FakePodInterface) GetLogs(name string, opts *corev1.PodLogOptions) *restclient.Request { + body := io.NopCloser(strings.NewReader(f.Logs)) + client := &http.Client{Transport: roundTripperFunc(func(*http.Request) (*http.Response, error) { + return &http.Response{StatusCode: http.StatusOK, Body: body}, nil + })} + content := restclient.ClientContentConfig{ + ContentType: runtime.ContentTypeJSON, + GroupVersion: schema.GroupVersion{Version: "v1"}, + Negotiator: runtime.NewClientNegotiator(schemek8s.Codecs.WithoutConversion(), schema.GroupVersion{Version: "v1"}), + } + return restclient.NewRequestWithClient(&url.URL{Scheme: "https", Host: "localhost"}, "", content, client).Verb("GET") +} + +type roundTripperFunc func(*http.Request) (*http.Response, error) + +func (f roundTripperFunc) RoundTrip(req *http.Request) (*http.Response, error) { + return f(req) +} diff --git a/pkg/toolsets/openshift/nodes.go b/pkg/toolsets/openshift/nodes.go new file mode 100644 index 00000000..3f7fc688 --- /dev/null +++ b/pkg/toolsets/openshift/nodes.go @@ -0,0 +1,126 @@ +package openshift + +import ( + "errors" + "fmt" + "time" + + "github.com/google/jsonschema-go/jsonschema" + "k8s.io/utils/ptr" + + "github.com/containers/kubernetes-mcp-server/pkg/api" + "github.com/containers/kubernetes-mcp-server/pkg/ocp" +) + +func initNodes() []api.ServerTool { + return []api.ServerTool{ + { + Tool: api.Tool{ + Name: "nodes_debug_exec", + Description: "Run commands on an OpenShift node using a privileged debug pod with comprehensive troubleshooting utilities. The debug pod uses the UBI9 toolbox image which includes: systemd tools (systemctl, journalctl), networking tools (ss, ip, ping, traceroute, nmap), process tools (ps, top, lsof, strace), file system tools (find, tar, rsync), and debugging tools (gdb). Commands execute in a chroot of the host filesystem, providing full access to node-level diagnostics. Output is truncated to the most recent 100 lines, so prefer filters like grep when expecting large logs.", + InputSchema: &jsonschema.Schema{ + Type: "object", + Properties: map[string]*jsonschema.Schema{ + "node": { + Type: "string", + Description: "Name of the node to debug (e.g. worker-0).", + }, + "command": { + Type: "array", + Description: "Command to execute on the node via chroot. All standard debugging utilities are available including systemctl, journalctl, ss, ip, ping, traceroute, nmap, ps, top, lsof, strace, find, tar, rsync, gdb, and more. Provide each argument as a separate array item (e.g. ['systemctl', 'status', 'kubelet'] or ['journalctl', '-u', 'kubelet', '--since', '1 hour ago']).", + Items: &jsonschema.Schema{Type: "string"}, + }, + "namespace": { + Type: "string", + Description: "Namespace to create the temporary debug pod in (optional, defaults to the current namespace or 'default').", + }, + "image": { + Type: "string", + Description: "Container image to use for the debug pod (optional). Defaults to registry.access.redhat.com/ubi9/toolbox:latest which provides comprehensive debugging and troubleshooting utilities.", + }, + "timeout_seconds": { + Type: "integer", + Description: "Maximum time to wait for the command to complete before timing out (optional, defaults to 300 seconds).", + Minimum: ptr.To(float64(1)), + }, + }, + Required: []string{"node", "command"}, + }, + Annotations: api.ToolAnnotations{ + Title: "Nodes: Debug Exec", + ReadOnlyHint: ptr.To(false), + DestructiveHint: ptr.To(true), + IdempotentHint: ptr.To(false), + OpenWorldHint: ptr.To(true), + }, + }, + Handler: nodesDebugExec, + }, + } +} + +func nodesDebugExec(params api.ToolHandlerParams) (*api.ToolCallResult, error) { + nodeArg := params.GetArguments()["node"] + nodeName, ok := nodeArg.(string) + if nodeArg == nil || !ok || nodeName == "" { + return api.NewToolCallResult("", errors.New("missing required argument: node")), nil + } + + commandArg := params.GetArguments()["command"] + command, err := toStringSlice(commandArg) + if err != nil { + return api.NewToolCallResult("", fmt.Errorf("invalid command argument: %w", err)), nil + } + + namespace := "" + if nsArg, ok := params.GetArguments()["namespace"].(string); ok { + namespace = nsArg + } + + image := "" + if imageArg, ok := params.GetArguments()["image"].(string); ok { + image = imageArg + } + + var timeout time.Duration + if timeoutRaw, exists := params.GetArguments()["timeout_seconds"]; exists && timeoutRaw != nil { + switch v := timeoutRaw.(type) { + case float64: + timeout = time.Duration(int64(v)) * time.Second + case int: + timeout = time.Duration(v) * time.Second + case int64: + timeout = time.Duration(v) * time.Second + default: + return api.NewToolCallResult("", errors.New("timeout_seconds must be a numeric value")), nil + } + } + + output, execErr := ocp.NodesDebugExec(params.Context, ocp.NewOpenshiftClient(params.Kubernetes), namespace, nodeName, image, command, timeout) + if output == "" && execErr == nil { + output = fmt.Sprintf("Command executed successfully on node %s but produced no output.", nodeName) + } + return api.NewToolCallResult(output, execErr), nil +} + +func toStringSlice(arg any) ([]string, error) { + if arg == nil { + return nil, errors.New("command is required") + } + raw, ok := arg.([]interface{}) + if !ok { + return nil, errors.New("command must be an array of strings") + } + if len(raw) == 0 { + return nil, errors.New("command array cannot be empty") + } + command := make([]string, 0, len(raw)) + for _, item := range raw { + str, ok := item.(string) + if !ok { + return nil, errors.New("command items must be strings") + } + command = append(command, str) + } + return command, nil +} diff --git a/pkg/toolsets/openshift/nodes_test.go b/pkg/toolsets/openshift/nodes_test.go new file mode 100644 index 00000000..7292de9b --- /dev/null +++ b/pkg/toolsets/openshift/nodes_test.go @@ -0,0 +1,95 @@ +package openshift + +import ( + "context" + "testing" + "time" + + "github.com/containers/kubernetes-mcp-server/pkg/api" + "github.com/containers/kubernetes-mcp-server/pkg/ocp" +) + +type staticRequest struct { + args map[string]any +} + +func (s staticRequest) GetArguments() map[string]any { + return s.args +} + +func TestNodesDebugExecHandlerValidatesInput(t *testing.T) { + t.Run("missing node", func(t *testing.T) { + params := api.ToolHandlerParams{ + Context: context.Background(), + ToolCallRequest: staticRequest{args: map[string]any{}}, + } + result, err := nodesDebugExec(params) + if err != nil { + t.Fatalf("handler returned error: %v", err) + } + if result.Error == nil || result.Error.Error() != "missing required argument: node" { + t.Fatalf("unexpected error: %v", result.Error) + } + }) + + t.Run("invalid command type", func(t *testing.T) { + params := api.ToolHandlerParams{ + Context: context.Background(), + ToolCallRequest: staticRequest{args: map[string]any{ + "node": "worker-0", + "command": "ls -la", + }}, + } + result, err := nodesDebugExec(params) + if err != nil { + t.Fatalf("handler returned error: %v", err) + } + if result.Error == nil || result.Error.Error() != "invalid command argument: command must be an array of strings" { + t.Fatalf("unexpected error: %v", result.Error) + } + }) +} + +func TestNodesDebugExecHandlerExecutesCommand(t *testing.T) { + env := ocp.NewNodeDebugTestEnv(t) + env.Pods.Logs = "done" + + // Call NodesDebugExec directly instead of going through the handler + // This avoids the need to mock the full kubernetes.Kubernetes type + output, err := ocp.NodesDebugExec( + context.Background(), + env.Kubernetes, + "debug", + "infra-node", + "registry.local/debug:latest", + []string{"systemctl", "status", "kubelet"}, + 15*time.Second, + ) + + if err != nil { + t.Fatalf("NodesDebugExec returned error: %v", err) + } + if output != "done" { + t.Fatalf("unexpected output: %q", output) + } + + created := env.Pods.Created + if created == nil { + t.Fatalf("expected pod creation") + } + if created.Namespace != "debug" { + t.Fatalf("expected namespace override, got %q", created.Namespace) + } + if created.Spec.Containers[0].Image != "registry.local/debug:latest" { + t.Fatalf("expected custom image, got %q", created.Spec.Containers[0].Image) + } + expectedCommand := []string{"chroot", "/host", "systemctl", "status", "kubelet"} + if len(created.Spec.Containers[0].Command) != len(expectedCommand) { + t.Fatalf("unexpected command length: %v", created.Spec.Containers[0].Command) + } + for i, part := range expectedCommand { + if created.Spec.Containers[0].Command[i] != part { + t.Fatalf("command[%d]=%q expected %q", i, created.Spec.Containers[0].Command[i], part) + } + } +} diff --git a/pkg/toolsets/openshift/toolset.go b/pkg/toolsets/openshift/toolset.go new file mode 100644 index 00000000..d11e4b66 --- /dev/null +++ b/pkg/toolsets/openshift/toolset.go @@ -0,0 +1,31 @@ +package openshift + +import ( + "slices" + + "github.com/containers/kubernetes-mcp-server/pkg/api" + internalk8s "github.com/containers/kubernetes-mcp-server/pkg/kubernetes" + "github.com/containers/kubernetes-mcp-server/pkg/toolsets" +) + +type Toolset struct{} + +var _ api.Toolset = (*Toolset)(nil) + +func (t *Toolset) GetName() string { + return "openshift-core" +} + +func (t *Toolset) GetDescription() string { + return "Core OpenShift-specific tools (Node debugging, etc.)" +} + +func (t *Toolset) GetTools(o internalk8s.Openshift) []api.ServerTool { + return slices.Concat( + initNodes(), + ) +} + +func init() { + toolsets.Register(&Toolset{}) +}