Skip to content

Commit fb7eed5

Browse files
committed
Add node debug tool with tests
1 parent 7107a24 commit fb7eed5

File tree

13 files changed

+1055
-2
lines changed

13 files changed

+1055
-2
lines changed

README.md

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ A powerful and flexible Kubernetes [Model Context Protocol (MCP)](https://blog.m
1919
- **Top** gets resource usage metrics for all pods or a specific pod in the specified namespace.
2020
- **Exec** into a pod and run a command.
2121
- **Run** a container image in a pod and optionally expose it.
22+
- **Node debug** runs privileged commands directly on cluster nodes via a managed debug pod.
2223
- **✅ Namespaces**: List Kubernetes Namespaces.
2324
- **✅ Events**: View Kubernetes events in all namespaces or in a specific namespace.
2425
- **✅ Projects**: List OpenShift Projects.
@@ -220,6 +221,24 @@ List all the Kubernetes namespaces in the current cluster
220221

221222
**Parameters:** None
222223

224+
### `nodes_debug_exec`
225+
226+
Run commands on an OpenShift node by creating a short-lived privileged debug pod that automatically chroots into the host filesystem. Output is limited to the latest 100 log lines, so use filtering (e.g., `grep`, `journalctl --since`, etc.) for high-volume commands.
227+
228+
**Parameters:**
229+
- `node` (`string`, required)
230+
- Name of the node to target (for example `worker-0`)
231+
- `command` (`string[]`, required)
232+
- Command and arguments to run inside the node's host filesystem
233+
- Example: `["systemctl", "status", "kubelet"]`
234+
- `namespace` (`string`, optional)
235+
- Namespace used for the temporary debug pod
236+
- Defaults to the configured namespace or `default`
237+
- `image` (`string`, optional)
238+
- Override the container image used for the debug pod
239+
- `timeout_seconds` (`integer`, optional)
240+
- Maximum time to wait for the command to finish (defaults to 300 seconds)
241+
223242
### `pods_delete`
224243

225244
Delete a Kubernetes Pod in the current or provided namespace with the provided name

pkg/kubernetes/kubernetes.go

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import (
66
"strings"
77

88
"k8s.io/apimachinery/pkg/runtime"
9+
corev1client "k8s.io/client-go/kubernetes/typed/core/v1"
910

1011
"github.com/fsnotify/fsnotify"
1112

@@ -38,7 +39,8 @@ const (
3839
type CloseWatchKubeConfig func() error
3940

4041
type Kubernetes struct {
41-
manager *Manager
42+
manager *Manager
43+
podClientFactory func(namespace string) (corev1client.PodInterface, error)
4244
}
4345

4446
type Manager struct {
@@ -209,6 +211,16 @@ func (m *Manager) Derived(ctx context.Context) (*Kubernetes, error) {
209211
return derived, nil
210212
}
211213

214+
func (k *Kubernetes) podsClient(namespace string) (corev1client.PodInterface, error) {
215+
if k.podClientFactory != nil {
216+
return k.podClientFactory(namespace)
217+
}
218+
if k.manager == nil || k.manager.accessControlClientSet == nil {
219+
return nil, errors.New("kubernetes manager is not initialized")
220+
}
221+
return k.manager.accessControlClientSet.Pods(namespace)
222+
}
223+
212224
func (k *Kubernetes) NewHelm() *helm.Helm {
213225
// This is a derived Kubernetes, so it already has the Helm initialized
214226
return helm.NewHelm(k.manager)

pkg/kubernetes/nodes.go

Lines changed: 240 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,240 @@
1+
package kubernetes
2+
3+
import (
4+
"context"
5+
"errors"
6+
"fmt"
7+
"strings"
8+
"time"
9+
10+
"github.com/containers/kubernetes-mcp-server/pkg/version"
11+
12+
corev1 "k8s.io/api/core/v1"
13+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
14+
"k8s.io/utils/ptr"
15+
)
16+
17+
const (
	// defaultNodeDebugImage is the container image used for the debug pod when the
	// caller does not supply one. It only needs to provide a chroot binary.
	defaultNodeDebugImage = "quay.io/fedora/fedora:latest"

	// nodeDebugContainerName is the name given to the single container of the debug
	// pod (the same name `oc debug` uses by default).
	nodeDebugContainerName = "debug"

	// defaultNodeDebugTimeout bounds how long NodesDebugExec waits for the debug pod
	// to finish when the caller passes a non-positive timeout.
	defaultNodeDebugTimeout = 5 * time.Minute
)
25+
26+
// NodesDebugExec mimics `oc debug node/<name> -- <command...>` by creating a privileged pod on the target
27+
// node, running the provided command within a chroot of the host filesystem, collecting its output, and
28+
// removing the pod afterwards.
29+
//
30+
// When namespace is empty, the configured namespace (or "default" if none) is used. When image is empty the
31+
// default debug image is used. Timeout controls how long we wait for the pod to complete.
32+
func (k *Kubernetes) NodesDebugExec(
33+
ctx context.Context,
34+
namespace string,
35+
nodeName string,
36+
image string,
37+
command []string,
38+
timeout time.Duration,
39+
) (string, error) {
40+
if nodeName == "" {
41+
return "", errors.New("node name is required")
42+
}
43+
if len(command) == 0 {
44+
return "", errors.New("command is required")
45+
}
46+
47+
ns := k.NamespaceOrDefault(namespace)
48+
if ns == "" {
49+
ns = "default"
50+
}
51+
debugImage := image
52+
if debugImage == "" {
53+
debugImage = defaultNodeDebugImage
54+
}
55+
if timeout <= 0 {
56+
timeout = defaultNodeDebugTimeout
57+
}
58+
59+
podsClient, err := k.podsClient(ns)
60+
if err != nil {
61+
return "", fmt.Errorf("failed to get pod client for namespace %s: %w", ns, err)
62+
}
63+
64+
sanitizedNode := sanitizeForName(nodeName)
65+
hostPathType := corev1.HostPathDirectory
66+
67+
debugPod := &corev1.Pod{
68+
ObjectMeta: metav1.ObjectMeta{
69+
GenerateName: fmt.Sprintf("node-debug-%s-", sanitizedNode),
70+
Namespace: ns,
71+
Labels: map[string]string{
72+
AppKubernetesManagedBy: version.BinaryName,
73+
AppKubernetesComponent: "node-debug",
74+
AppKubernetesName: fmt.Sprintf("node-debug-%s", sanitizedNode),
75+
},
76+
},
77+
Spec: corev1.PodSpec{
78+
AutomountServiceAccountToken: ptr.To(false),
79+
NodeName: nodeName,
80+
RestartPolicy: corev1.RestartPolicyNever,
81+
SecurityContext: &corev1.PodSecurityContext{
82+
RunAsUser: ptr.To[int64](0),
83+
},
84+
Tolerations: []corev1.Toleration{
85+
{Operator: corev1.TolerationOpExists},
86+
{Operator: corev1.TolerationOpExists, Effect: corev1.TaintEffectNoSchedule},
87+
{Operator: corev1.TolerationOpExists, Effect: corev1.TaintEffectNoExecute},
88+
},
89+
Volumes: []corev1.Volume{
90+
{
91+
Name: "host-root",
92+
VolumeSource: corev1.VolumeSource{
93+
HostPath: &corev1.HostPathVolumeSource{
94+
Path: "/",
95+
Type: &hostPathType,
96+
},
97+
},
98+
},
99+
},
100+
Containers: []corev1.Container{
101+
{
102+
Name: nodeDebugContainerName,
103+
Image: debugImage,
104+
ImagePullPolicy: corev1.PullIfNotPresent,
105+
Command: append([]string{"chroot", "/host"}, command...),
106+
SecurityContext: &corev1.SecurityContext{
107+
Privileged: ptr.To(true),
108+
RunAsUser: ptr.To[int64](0),
109+
},
110+
VolumeMounts: []corev1.VolumeMount{
111+
{Name: "host-root", MountPath: "/host"},
112+
},
113+
},
114+
},
115+
},
116+
}
117+
118+
created, err := podsClient.Create(ctx, debugPod, metav1.CreateOptions{})
119+
if err != nil {
120+
return "", fmt.Errorf("failed to create debug pod: %w", err)
121+
}
122+
123+
// Ensure the pod is deleted regardless of completion state.
124+
defer func() {
125+
deleteCtx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
126+
defer cancel()
127+
grace := int64(0)
128+
_ = podsClient.Delete(deleteCtx, created.Name, metav1.DeleteOptions{GracePeriodSeconds: &grace})
129+
}()
130+
131+
pollCtx, cancel := context.WithTimeout(ctx, timeout)
132+
defer cancel()
133+
134+
ticker := time.NewTicker(2 * time.Second)
135+
defer ticker.Stop()
136+
137+
var (
138+
lastPod *corev1.Pod
139+
terminated *corev1.ContainerStateTerminated
140+
waitMsg string
141+
)
142+
143+
for {
144+
select {
145+
case <-pollCtx.Done():
146+
return "", fmt.Errorf("timed out waiting for debug pod %s to complete: %w", created.Name, pollCtx.Err())
147+
default:
148+
}
149+
150+
current, getErr := podsClient.Get(pollCtx, created.Name, metav1.GetOptions{})
151+
if getErr != nil {
152+
return "", fmt.Errorf("failed to get debug pod status: %w", getErr)
153+
}
154+
lastPod = current
155+
156+
if status := containerStatusByName(current.Status.ContainerStatuses, nodeDebugContainerName); status != nil {
157+
if status.State.Waiting != nil {
158+
waitMsg = fmt.Sprintf("container waiting: %s", status.State.Waiting.Reason)
159+
// Image pull issues should fail fast.
160+
if status.State.Waiting.Reason == "ErrImagePull" || status.State.Waiting.Reason == "ImagePullBackOff" {
161+
return "", fmt.Errorf("debug container failed to start (%s): %s", status.State.Waiting.Reason, status.State.Waiting.Message)
162+
}
163+
}
164+
if status.State.Terminated != nil {
165+
terminated = status.State.Terminated
166+
break
167+
}
168+
}
169+
170+
if current.Status.Phase == corev1.PodFailed {
171+
break
172+
}
173+
174+
select {
175+
case <-pollCtx.Done():
176+
return "", fmt.Errorf("timed out waiting for debug pod %s to complete: %w", created.Name, pollCtx.Err())
177+
case <-ticker.C:
178+
}
179+
}
180+
181+
logCtx, logCancel := context.WithTimeout(context.Background(), 30*time.Second)
182+
defer logCancel()
183+
logs, logErr := k.PodsLog(logCtx, ns, created.Name, nodeDebugContainerName, false, 0)
184+
if logErr != nil {
185+
return "", fmt.Errorf("failed to retrieve debug pod logs: %w", logErr)
186+
}
187+
logs = strings.TrimSpace(logs)
188+
189+
if terminated != nil {
190+
if terminated.ExitCode != 0 {
191+
errMsg := fmt.Sprintf("command exited with code %d", terminated.ExitCode)
192+
if terminated.Reason != "" {
193+
errMsg = fmt.Sprintf("%s (%s)", errMsg, terminated.Reason)
194+
}
195+
if terminated.Message != "" {
196+
errMsg = fmt.Sprintf("%s: %s", errMsg, terminated.Message)
197+
}
198+
return logs, errors.New(errMsg)
199+
}
200+
return logs, nil
201+
}
202+
203+
if lastPod != nil && lastPod.Status.Reason != "" {
204+
return logs, fmt.Errorf("debug pod failed: %s", lastPod.Status.Reason)
205+
}
206+
if waitMsg != "" {
207+
return logs, fmt.Errorf("debug container did not complete: %s", waitMsg)
208+
}
209+
return logs, errors.New("debug container did not reach a terminal state")
210+
}
211+
212+
// sanitizeForName converts an arbitrary name (such as a node name) into a fragment that is safe to
// embed in Kubernetes object names and label values.
//
// The input is lower-cased and every rune other than a-z, 0-9 or '-' is replaced with '-'. Leading and
// trailing dashes are removed and the result is capped at 40 characters. Trailing dashes exposed by the
// truncation are removed as well, so the result always starts and ends with an alphanumeric character.
// If nothing usable remains, "node" is returned.
func sanitizeForName(name string) string {
	const maxLen = 40

	var b strings.Builder
	b.Grow(len(name))
	for _, r := range strings.ToLower(name) {
		switch {
		case r >= 'a' && r <= 'z', r >= '0' && r <= '9', r == '-':
			b.WriteRune(r)
		default:
			b.WriteRune('-')
		}
	}

	sanitized := strings.Trim(b.String(), "-")
	if len(sanitized) > maxLen {
		// Cutting at maxLen may land right after a dash; trim again so the value stays a valid
		// label value when used as a suffix.
		sanitized = strings.TrimRight(sanitized[:maxLen], "-")
	}
	if sanitized == "" {
		return "node"
	}
	return sanitized
}
232+
233+
func containerStatusByName(statuses []corev1.ContainerStatus, name string) *corev1.ContainerStatus {
234+
for idx := range statuses {
235+
if statuses[idx].Name == name {
236+
return &statuses[idx]
237+
}
238+
}
239+
return nil
240+
}

0 commit comments

Comments
 (0)