package kubernetes

import (
	"context"
	"errors"
	"fmt"
	"strings"
	"time"

	"github.com/containers/kubernetes-mcp-server/pkg/version"

	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/utils/ptr"
)
| 16 | + |
| 17 | +const ( |
| 18 | + // defaultNodeDebugImage is a lightweight image that provides the tooling required to run chroot. |
| 19 | + defaultNodeDebugImage = "quay.io/fedora/fedora:latest" |
| 20 | + // nodeDebugContainerName is the name used for the debug container, matching oc debug defaults. |
| 21 | + nodeDebugContainerName = "debug" |
| 22 | + // defaultNodeDebugTimeout is the maximum time to wait for the debug pod to finish executing. |
| 23 | + defaultNodeDebugTimeout = 5 * time.Minute |
| 24 | +) |

// NodesDebugExec mimics `oc debug node/<name> -- <command...>` by creating a privileged pod on the target
// node, running the provided command within a chroot of the host filesystem, collecting its output, and
// removing the pod afterwards.
//
// When namespace is empty, the configured namespace (or "default" if none) is used. When image is empty,
// the default debug image is used. Timeout controls how long we wait for the pod to complete.
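//
// Example (illustrative; the node name and command are placeholders):
//
//	out, err := k.NodesDebugExec(ctx, "", "worker-0", "", []string{"uname", "-a"}, 0)
//	if err != nil {
//		// On a non-zero exit code, out still carries whatever the command logged.
//	}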
func (k *Kubernetes) NodesDebugExec(
	ctx context.Context,
	namespace string,
	nodeName string,
	image string,
	command []string,
	timeout time.Duration,
) (string, error) {
	if nodeName == "" {
		return "", errors.New("node name is required")
	}
	if len(command) == 0 {
		return "", errors.New("command is required")
	}

	ns := k.NamespaceOrDefault(namespace)
	if ns == "" {
		ns = "default"
	}
	debugImage := image
	if debugImage == "" {
		debugImage = defaultNodeDebugImage
	}
	if timeout <= 0 {
		timeout = defaultNodeDebugTimeout
	}

	podsClient, err := k.podsClient(ns)
	if err != nil {
		return "", fmt.Errorf("failed to get pod client for namespace %s: %w", ns, err)
	}

	sanitizedNode := sanitizeForName(nodeName)
	hostPathType := corev1.HostPathDirectory

	debugPod := &corev1.Pod{
		ObjectMeta: metav1.ObjectMeta{
			GenerateName: fmt.Sprintf("node-debug-%s-", sanitizedNode),
			Namespace:    ns,
			Labels: map[string]string{
				AppKubernetesManagedBy: version.BinaryName,
				AppKubernetesComponent: "node-debug",
				AppKubernetesName:      fmt.Sprintf("node-debug-%s", sanitizedNode),
			},
		},
		Spec: corev1.PodSpec{
			AutomountServiceAccountToken: ptr.To(false),
			NodeName:                     nodeName,
			RestartPolicy:                corev1.RestartPolicyNever,
			SecurityContext: &corev1.PodSecurityContext{
				RunAsUser: ptr.To[int64](0),
			},
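			// Tolerate every taint (an Exists toleration with empty key and effect matches all) so the
			// pod can be scheduled on any node, including control-plane nodes.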
			Tolerations: []corev1.Toleration{
				{Operator: corev1.TolerationOpExists},
				{Operator: corev1.TolerationOpExists, Effect: corev1.TaintEffectNoSchedule},
				{Operator: corev1.TolerationOpExists, Effect: corev1.TaintEffectNoExecute},
			},
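			// Expose the node's root filesystem so the container can chroot into it.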
			Volumes: []corev1.Volume{
				{
					Name: "host-root",
					VolumeSource: corev1.VolumeSource{
						HostPath: &corev1.HostPathVolumeSource{
							Path: "/",
							Type: &hostPathType,
						},
					},
				},
			},
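			// The container wraps the user command in `chroot /host`, so it effectively runs on the node itself.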
			Containers: []corev1.Container{
				{
					Name:            nodeDebugContainerName,
					Image:           debugImage,
					ImagePullPolicy: corev1.PullIfNotPresent,
					Command:         append([]string{"chroot", "/host"}, command...),
					SecurityContext: &corev1.SecurityContext{
						Privileged: ptr.To(true),
						RunAsUser:  ptr.To[int64](0),
					},
					VolumeMounts: []corev1.VolumeMount{
						{Name: "host-root", MountPath: "/host"},
					},
				},
			},
		},
	}

	created, err := podsClient.Create(ctx, debugPod, metav1.CreateOptions{})
	if err != nil {
		return "", fmt.Errorf("failed to create debug pod: %w", err)
	}

	// Ensure the pod is deleted regardless of completion state.
	defer func() {
		deleteCtx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
		defer cancel()
		grace := int64(0)
		_ = podsClient.Delete(deleteCtx, created.Name, metav1.DeleteOptions{GracePeriodSeconds: &grace})
	}()

	pollCtx, cancel := context.WithTimeout(ctx, timeout)
	defer cancel()

	ticker := time.NewTicker(2 * time.Second)
	defer ticker.Stop()

	var (
		lastPod    *corev1.Pod
		terminated *corev1.ContainerStateTerminated
		waitMsg    string
	)

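	// Poll until the debug container terminates, the pod fails outright, or the timeout elapses.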
	for {
		select {
		case <-pollCtx.Done():
			return "", fmt.Errorf("timed out waiting for debug pod %s to complete: %w", created.Name, pollCtx.Err())
		default:
		}

		current, getErr := podsClient.Get(pollCtx, created.Name, metav1.GetOptions{})
		if getErr != nil {
			return "", fmt.Errorf("failed to get debug pod status: %w", getErr)
		}
		lastPod = current

		if status := containerStatusByName(current.Status.ContainerStatuses, nodeDebugContainerName); status != nil {
			if status.State.Waiting != nil {
				waitMsg = fmt.Sprintf("container waiting: %s", status.State.Waiting.Reason)
				// Image pull issues should fail fast.
				if status.State.Waiting.Reason == "ErrImagePull" || status.State.Waiting.Reason == "ImagePullBackOff" {
					return "", fmt.Errorf("debug container failed to start (%s): %s", status.State.Waiting.Reason, status.State.Waiting.Message)
				}
			}
			if status.State.Terminated != nil {
				terminated = status.State.Terminated
				break
			}
		}

		if current.Status.Phase == corev1.PodFailed {
			break
		}

		select {
		case <-pollCtx.Done():
			return "", fmt.Errorf("timed out waiting for debug pod %s to complete: %w", created.Name, pollCtx.Err())
		case <-ticker.C:
		}
	}

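	// Retrieve logs with a fresh context: pollCtx may already be cancelled or expired at this point.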
	logCtx, logCancel := context.WithTimeout(context.Background(), 30*time.Second)
	defer logCancel()
	logs, logErr := k.PodsLog(logCtx, ns, created.Name, nodeDebugContainerName, false, 0)
	if logErr != nil {
		return "", fmt.Errorf("failed to retrieve debug pod logs: %w", logErr)
	}
	logs = strings.TrimSpace(logs)

	if terminated != nil {
		if terminated.ExitCode != 0 {
			errMsg := fmt.Sprintf("command exited with code %d", terminated.ExitCode)
			if terminated.Reason != "" {
				errMsg = fmt.Sprintf("%s (%s)", errMsg, terminated.Reason)
			}
			if terminated.Message != "" {
				errMsg = fmt.Sprintf("%s: %s", errMsg, terminated.Message)
			}
			return logs, errors.New(errMsg)
		}
		return logs, nil
	}

	if lastPod != nil && lastPod.Status.Reason != "" {
		return logs, fmt.Errorf("debug pod failed: %s", lastPod.Status.Reason)
	}
	if waitMsg != "" {
		return logs, fmt.Errorf("debug container did not complete: %s", waitMsg)
	}
	return logs, errors.New("debug container did not reach a terminal state")
}

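// sanitizeForName lowercases the node name and replaces every character outside [a-z0-9-] with a dash,
// yielding a fragment that is safe to embed in pod names and label values.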
func sanitizeForName(name string) string {
	lower := strings.ToLower(name)
	var b strings.Builder
	b.Grow(len(lower))
	for _, r := range lower {
		if (r >= 'a' && r <= 'z') || (r >= '0' && r <= '9') || r == '-' {
			b.WriteRune(r)
			continue
		}
		b.WriteRune('-')
	}
	sanitized := strings.Trim(b.String(), "-")
	if sanitized == "" {
		sanitized = "node"
	}
	if len(sanitized) > 40 {
		// Re-trim after truncation: a trailing dash is invalid in both names and label values.
		sanitized = strings.TrimRight(sanitized[:40], "-")
	}
	return sanitized
}

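// containerStatusByName returns a pointer to the status entry for the named container, or nil when the
// kubelet has not reported it yet.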
func containerStatusByName(statuses []corev1.ContainerStatus, name string) *corev1.ContainerStatus {
	for idx := range statuses {
		if statuses[idx].Name == name {
			return &statuses[idx]
		}
	}
	return nil
}