diff --git a/plugins/sagemaker-ai/.claude-plugin/plugin.json b/plugins/sagemaker-ai/.claude-plugin/plugin.json index b0f4dd4d..64c015fc 100644 --- a/plugins/sagemaker-ai/.claude-plugin/plugin.json +++ b/plugins/sagemaker-ai/.claude-plugin/plugin.json @@ -18,5 +18,5 @@ "license": "Apache-2.0", "name": "sagemaker-ai", "repository": "https://github.com/awslabs/agent-plugins", - "version": "1.1.0" + "version": "1.1.1" } diff --git a/plugins/sagemaker-ai/skills/hyperpod-cluster-debugger/SKILL.md b/plugins/sagemaker-ai/skills/hyperpod-cluster-debugger/SKILL.md new file mode 100644 index 00000000..0973717d --- /dev/null +++ b/plugins/sagemaker-ai/skills/hyperpod-cluster-debugger/SKILL.md @@ -0,0 +1,267 @@ +--- +name: hyperpod-cluster-debugger +description: Use for cluster-wide SageMaker HyperPod issues (EKS or Slurm) across the full cluster lifecycle — pre-create validation and creation/deployment failures (CloudFormation CREATE_FAILED / ROLLBACK_COMPLETE / "Embedded stack failed", stuck in Creating/Updating/Failed, "EFA health checks did not run successfully", "Lifecycle scripts did not run" / timed out, "Insufficient capacity" / "No subnets in the capacity AZ", "Instance bootstrap failed...network misconfiguration", service-linked role missing, S3 lifecycle / CRLF / on_create.sh); plus post-deployment ops (EKS access entries / kubectl auth, EKS add-ons, AMI / UpdateClusterSoftware rollback, ClusterMaintenanceRollbackFailed, dangling nodes, autoscaler/Karpenter conflicts, service quotas, permission-boundary denials, Slurm controller). Read-only. `--validate` pre-flight checks SGs / subnets / IAM / VPC endpoints / S3 lifecycle / per-AZ capacity. Not for per-node issues (hyperpod-node-debugger), NCCL (hyperpod-nccl), or MFU (hyperpod-mfu-debugger). 
metadata:
  version: "1.0.0"
---

# HyperPod Cluster Debugger

Read-only diagnostic for cluster-level HyperPod issues across the full cluster lifecycle: **pre-create validation**, **deployment/creation failures**, and **post-deployment operations** (cluster-wide health, AMI upgrades, dangling nodes, autoscaler conflicts, Slurm controller, node replacement). Supports **EKS** and **Slurm**.

**Clear separation of concerns:**

- `scripts/diagnose-cluster.sh` is a **read-only signal collector**. It reads cluster state via AWS APIs and (for Slurm controller health) SSM, then prints each detected issue as a `[FAIL]` line. Every `[FAIL]` line ends with a pointer of the form `→ references/cluster-diagnostics-detail.md § <section>
` or `→ references/cluster-operations.md § `. The script never prints remediation commands and never modifies cluster state. +- [references/cluster-diagnostics-detail.md](references/cluster-diagnostics-detail.md) contains the full remediation runbook per section (A–L). +- [references/cluster-operations.md](references/cluster-operations.md) contains operational deep-dives (EFA SG, capacity, lifecycle, EKS access, SSM, node replacement, Slurm operations). +- [references/cloudformation-errors.md](references/cloudformation-errors.md) is the full CloudFormation resource-by-resource error catalog (nested-stack navigation, `AWS::SageMaker::Cluster`/`AWS::IAM::Role`/`AWS::FSx::FileSystem`/`Custom::Resource`/etc.) — open this when § H points you into deep CFN debugging. +- [references/capacity-planning.md](references/capacity-planning.md) is the in-depth capacity strategy guide (on-demand vs. Flexible Training Plans vs. ODCR, AZ/AZ-ID selection, subnet IP sizing per instance type, service quotas) — open this when § B or pre-create validation flags capacity/subnet sizing. +- [references/lifecycle-scripts.md](references/lifecycle-scripts.md) is the in-depth lifecycle-script reference (S3 layout for Slurm/EKS, execution order, `config.py` toggles, on-node debug under `/var/log/provision/`) — open this when § C points you at a specific lifecycle failure. +- This SKILL.md is the **playbook for Claude**: run the script, read each finding's pointer, open the referenced section, walk the customer through the fix with explicit approval. + +**Always run Step 1 first** — it collects all cluster signals and produces a prioritized issue list with reference pointers. + +--- + +## Workflow (authoritative) + +1. **Collect inputs** — HyperPod cluster name (not EKS name), region, exact error message from console/CLI/CloudFormation. +2. **Run `scripts/diagnose-cluster.sh`** (or `--validate` for pre-create checks). +3. 
**Read the script output top-to-bottom.** For every `[FAIL]` line, note the trailing `→ references/<file>.md § <section>
` pointer.
4. **Open each referenced section.** Use the `Read` tool on the exact file path.
5. **Present the remediation to the customer** with the finding, root cause, exact command(s), and blast radius. Cluster-level remediations (SG changes, AMI upgrade, kubeconfig overwrite, node replacement, service-linked role creation) have wider blast radius than node-level — describe the impact clearly.
6. **Wait for explicit customer approval** before running any state-changing command.
7. **Re-run the diagnostic** after remediation to confirm.

---

## Step 1: Collect information & run diagnostics

Ask the customer for:

- **HyperPod cluster name** (not the EKS cluster name):

  ```bash
  aws sagemaker list-clusters --region <region> --query 'ClusterSummaries[*].ClusterName'
  ```

- **AWS region** — e.g. `us-east-1`, `us-west-2`
- **Error message** — the exact error from console, CLI, or CloudFormation

Then run the diagnostic script:

```bash
# Diagnose an existing cluster (read-only; prints findings with references/... pointers):
bash scripts/diagnose-cluster.sh --cluster <cluster-name> --region <region>

# Pre-flight validation (no cluster needed — validates SGs, subnets, IAM, VPC endpoints,
# optionally S3 lifecycle scripts and per-AZ instance-type capacity):
bash scripts/diagnose-cluster.sh --validate --region <region> \
  --sg-ids <sg-id,...> --subnet-ids <subnet-id,...> [--iam-role <role-name>] \
  [--s3-uri s3://<bucket>/path/] [--instance-type ml.p5.48xlarge]
```

The script collects in one pass: cluster status, orchestrator type, provisioning mode, instance-group health, cluster events, VPC/SG configuration, EKS access + add-ons + aws-auth, SSM readiness, CloudWatch log availability, Slurm controller health (when applicable), dangling/orphaned node reconciliation.
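Once a run completes, the `[FAIL]` findings and the reference files they point at can be triaged mechanically. A minimal sketch over simulated output (real lines come from `diagnose-cluster.sh`; the finding text below is invented for illustration):

```shell
# Simulated diagnostic output; real runs emit lines of this shape
DIAG='[PASS] Cluster status: InService
[FAIL] SG sg-0abc missing self-referencing EFA rules → references/cluster-diagnostics-detail.md
[FAIL] Subnet subnet-0abc low on free IPs → references/capacity-planning.md'

# Keep only failures, then extract the unique reference files to open with Read
POINTERS="$(printf '%s\n' "$DIAG" | grep '\[FAIL\]' | grep -o 'references/[^ ]*' | sort -u)"
printf '%s\n' "$POINTERS"
```

This gives a deduplicated reading list before walking the customer through each section.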
Issues are categorized:

- **P0** — Fix immediately (blocks cluster operation)
- **P1** — Fix soon (degraded or at-risk)
- **P2** — Informational (review when convenient)

### Output tags

| Tag | Meaning |
| -------- | -------------------------------------------------------------------------------- |
| `[PASS]` | Check passed |
| `[FAIL]` | Problem found — counted in `CRITICAL_FAILURES` with a `→ references/...` pointer |
| `[WARN]` | Advisory |
| `[INFO]` | Informational |

The script never prints remediation commands. Each `[FAIL]` entry ends with a pointer of the form `→ references/cluster-diagnostics-detail.md § <section>
` (or `→ references/cluster-operations.md § `). Open the referenced section with `Read` to find the remediation runbook. + +--- + +## Step 2: Match signal → section + +**From `list-cluster-events` / error messages:** + +| Event / Error Message | Section | +| ----------------------------------------------------------------------------------- | -------------------------------------------------------------- | +| `"EFA health checks did not run successfully"` | **[A: EFA Health Checks](#a-efa-health-checks)** | +| `"Insufficient capacity"` / `"No subnets in the capacity AZ"` | **[B: Capacity & AZ](#b-capacity--az)** | +| `"Lifecycle scripts did not run successfully"` / `"timed out"` | **[C: Lifecycle Scripts](#c-lifecycle-scripts)** | +| `"the server has asked for the client to provide credentials"` / kubectl auth error | **[D: EKS Access](#d-eks-access--kubectl)** | +| Cluster InService but no instances visible / nodes not showing | **[E: Cluster Provisioning](#e-cluster-provisioning)** | +| `"Target is not connected"` / SSM errors | **[F: SSM Connectivity](#f-ssm-connectivity)** | +| Node replacement not happening / `batch-replace` not working | **[G: Node Replacement](#g-node-replacement)** | +| `"Embedded stack failed"` / CloudFormation error | **[H: CloudFormation Errors](#h-cloudformation-errors)** | +| `"ENI limit exceeded"` / `"vCPU limit"` / service quota error | **[B: Capacity & AZ](#b-capacity--az)** | +| `"UpdateClusterSoftware"` failed / AMI upgrade error | **[J: AMI & Cluster Updates](#j-ami--cluster-updates)** | +| Cluster stuck in `ClusterMaintenanceRollbackFailed` | **[J: AMI & Cluster Updates](#j-ami--cluster-updates)** | +| Dangling nodes on EKS after scale-up rollback | **[K: Dangling Nodes & Cleanup](#k-dangling-nodes--cleanup)** | +| Cluster Autoscaler stops working after HyperPod attached | **[L: Autoscaler Compatibility](#l-autoscaler-compatibility)** | + +**From symptoms:** + +| Symptom | Section | +| 
----------------------------------------------------------- | ------------------------------------------------------------------------------------------------ | +| Cluster creation failed | Run script → follow section pointer | +| Cluster stuck in Creating/Updating/Deleting > 1 hour | **[E: Cluster Provisioning](#e-cluster-provisioning)** | +| Cluster stuck in RollbackFailed / MaintenanceFailed | **[J: AMI & Cluster Updates](#j-ami--cluster-updates)** | +| AMI upgrade silently fails and rolls back | **[J: AMI & Cluster Updates](#j-ami--cluster-updates)** | +| Cluster InService, `kubectl get nodes` returns nothing | **[D](#d-eks-access--kubectl)** then **[E](#e-cluster-provisioning)** | +| Auto-repair enabled but nodes not being replaced | **[G: Node Replacement](#g-node-replacement)** | +| Ran `batch-replace-cluster-nodes` but nothing happened | **[G: Node Replacement](#g-node-replacement)** | +| Can't SSM into nodes | **[F: SSM Connectivity](#f-ssm-connectivity)** | +| Ghost/dangling nodes visible in EKS after rollback | **[K: Dangling Nodes & Cleanup](#k-dangling-nodes--cleanup)** | +| Cluster Autoscaler broken after HyperPod attachment | **[L: Autoscaler Compatibility](#l-autoscaler-compatibility)** | +| Node stuck in Failed after reboot | **[G: Node Replacement](#g-node-replacement)** | +| Topology labels missing on new nodes | **[K: Dangling Nodes & Cleanup](#k-dangling-nodes--cleanup)** | +| Need to find instance ID from Slurm node name | **[I: Utilities](#i-utilities)** | +| Slow I/O, data-loading bottleneck, FSx throughput saturated | [references/cluster-operations.md § 10 Filesystem Performance](references/cluster-operations.md) | + +--- + +## A: EFA Health Checks + +Security group missing self-referencing rules for inter-node EFA — the #1 cluster creation failure. Diagnose with the cluster script or `describe-security-groups`, then add inbound/outbound self-referencing rules plus outbound internet access to every SG used by the cluster. 
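The remediation in § A boils down to two `authorize-security-group-*` calls. Because they are state-changing, this sketch only prints them for customer review (the SG ID is a placeholder; the `--ip-permissions` shorthand assumes an all-traffic self-reference is acceptable for your security posture):

```shell
# Placeholder security group ID; substitute the SG(s) actually attached to the cluster
SG_ID="sg-0123456789abcdef0"

# Self-referencing all-traffic rules required for inter-node EFA; printed, not run,
# until the customer approves (these widen the SG's allowed traffic)
EFA_FIX="aws ec2 authorize-security-group-ingress --group-id ${SG_ID} --ip-permissions \"IpProtocol=-1,UserIdGroupPairs=[{GroupId=${SG_ID}}]\"
aws ec2 authorize-security-group-egress --group-id ${SG_ID} --ip-permissions \"IpProtocol=-1,UserIdGroupPairs=[{GroupId=${SG_ID}}]\""
printf '%s\n' "$EFA_FIX"
```

Repeat for every SG used by the cluster, then re-run the diagnostic to confirm.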
Full procedure: [references/cluster-diagnostics-detail.md § A](references/cluster-diagnostics-detail.md#a-efa-health-checks).

## B: Capacity & AZ

Instance type unavailable in the requested Availability Zone. Check AZ offerings with `describe-instance-type-offerings`, then try a different AZ, use Flexible Training Plans, or request reserved capacity.
Full procedure: [references/cluster-diagnostics-detail.md § B](references/cluster-diagnostics-detail.md#b-capacity--az).

## C: Lifecycle Scripts

Lifecycle scripts failed or timed out during provisioning. Check CloudWatch logs under `/aws/sagemaker/Clusters/<cluster-name>/<cluster-id>` for the specific error — common causes: missing S3 VPC endpoints, IAM permission gaps, Windows line endings, instance-group name mismatches.
Full procedure: [references/cluster-diagnostics-detail.md § C](references/cluster-diagnostics-detail.md#c-lifecycle-scripts).

## D: EKS Access / kubectl

IAM identity not in EKS access entries or kubeconfig not set up. Verify with `sts get-caller-identity`, check access entries and auth mode on the EKS cluster, then create an access entry with admin policy and update kubeconfig.
Full procedure: [references/cluster-diagnostics-detail.md § D](references/cluster-diagnostics-detail.md#d-eks-access--kubectl).

## E: Cluster Provisioning

Cluster shows InService but instances are missing — often expected with Continuous Provisioning (EKS only), where the cluster goes InService before all nodes are created. Check cluster events and node status; failures surface as events, not cluster-level errors.

**Cluster stuck in Creating/Updating/Deleting > 1 hour:** Check CloudFormation nested stacks for the real error (§ H), verify the IAM execution role has required permissions, check for capacity issues (§ B), and look at cluster events. If stuck in Deleting, check for VPC ENI dependencies. If no progress after 2 hours with no error events, escalate to AWS Support.
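For a stuck cluster, the first two read-only calls are the overall status and the recent events. A sketch with a hypothetical cluster name, printed for review in keeping with the workflow (the `list-cluster-events` flags mirror the usage documented elsewhere in this skill):

```shell
CLUSTER="my-hyperpod"   # hypothetical HyperPod cluster name (not the EKS name)
REGION="us-west-2"

# Both calls are read-only: current status, then the 20 most recent events
CHECKS="aws sagemaker describe-cluster --cluster-name ${CLUSTER} --region ${REGION} --query ClusterStatus
aws sagemaker list-cluster-events --cluster-name ${CLUSTER} --region ${REGION} --max-results 20"
printf '%s\n' "$CHECKS"
```

If events show failures, follow their section pointers; if there are no events at all, suspect capacity (§ B) or nested-stack failures (§ H).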
Full procedure: [references/cluster-diagnostics-detail.md § E](references/cluster-diagnostics-detail.md#e-cluster-provisioning).

## F: SSM Connectivity

SSM session fails with `Target is not connected`. Use the `sagemaker-cluster:<cluster-id>_<instance-group-name>-<instance-id>` target format (not a raw EC2 instance ID), verify the SSM plugin is installed, and confirm the node is Running. Check IAM permissions and VPC endpoints if timeouts persist.
Full procedure: [references/cluster-diagnostics-detail.md § F](references/cluster-diagnostics-detail.md#f-ssm-connectivity).

## G: Node Replacement

Auto or manual node replacement not triggering. For auto-replacement, verify `NodeRecovery` is enabled, check health agent logs and node labels/reasons, and confirm capacity. For manual recovery: reboot first, replace only if reboot fails. Cluster must be InService for `batch-replace-cluster-nodes`.
Full procedure: [references/cluster-diagnostics-detail.md § G](references/cluster-diagnostics-detail.md#g-node-replacement).

## H: CloudFormation Errors

Nested stack failures produce a vague `Embedded stack failed`. Drill into nested stacks via the Events tab filtered by Failed until you reach the actual non-stack resource failure. CLI alternative: ``describe-stack-events --query 'StackEvents[?ResourceStatus==`CREATE_FAILED`]'``. Includes guidance for service-linked role (SLR) failures and permission boundaries.
Full procedure: [references/cluster-diagnostics-detail.md § H](references/cluster-diagnostics-detail.md#h-cloudformation-errors).

## I: Utilities

Map Slurm node names (`ip-10-1-123-45`) to HyperPod instance IDs using `list-cluster-nodes` or the on-node `resource_config.json`. For large clusters, use the dump utility in `references/cluster-operations.md`.
Full procedure: [references/cluster-diagnostics-detail.md § I](references/cluster-diagnostics-detail.md#i-utilities).
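The § I mapping works because Slurm node names encode the instance's private IPv4. A minimal conversion sketch (matching the resulting IP against `list-cluster-nodes` output or the node's `resource_config.json` is the follow-up step):

```shell
# Slurm node names embed the private IPv4: ip-10-1-123-45 <-> 10.1.123.45
NODE="ip-10-1-123-45"
IP="${NODE#ip-}"    # strip the leading "ip-"
IP="${IP//-/.}"     # turn remaining dashes back into dots
echo "$IP"
```

The recovered IP is then compared against the node inventory to find the HyperPod instance ID.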
+ +## J: AMI & Cluster Updates + +`UpdateClusterSoftware` fails silently and rolls back, or the cluster gets stuck in `ClusterMaintenanceRollbackFailed`. Common causes: lifecycle scripts incompatible with the new AMI, insufficient capacity for the rolling update, or IAM gaps. For `RollbackFailed` (non-terminal state), collect diagnostics and escalate — do NOT attempt to delete and recreate. +Full procedure: [references/cluster-diagnostics-detail.md § J](references/cluster-diagnostics-detail.md#j-ami--cluster-updates). + +## K: Dangling Nodes & Cleanup + +After a failed scale-up or rollback, EKS may show nodes that HyperPod no longer manages — "dangling" nodes appear in `kubectl get nodes` but not in `list-cluster-nodes`. The diagnostic script flags them automatically. Topology-label gaps on new nodes typically resolve on the next reconciliation cycle. +Full procedure: [references/cluster-diagnostics-detail.md § K](references/cluster-diagnostics-detail.md#k-dangling-nodes--cleanup). + +## L: Autoscaler Compatibility + +Cluster Autoscaler can conflict with HyperPod-managed node groups because HyperPod controls node lifecycle independently. The fix is to exclude HyperPod node groups from CAS via the node-level `cluster-autoscaler.kubernetes.io/scale-down-disabled=true` annotation (not `safe-to-evict`, which is a pod annotation), or via the `--balancing-ignore-label=sagemaker.amazonaws.com/compute-type` CAS arg. +Full procedure: [references/cluster-diagnostics-detail.md § L](references/cluster-diagnostics-detail.md#l-autoscaler-compatibility). + +**Karpenter:** HyperPod nodes are not managed by Karpenter NodePools and should not conflict. If you see Karpenter disrupting HyperPod nodes, add `karpenter.sh/do-not-disrupt: "true"` to HyperPod pods, or configure a NodePool `requirements` filter that excludes nodes with `sagemaker.amazonaws.com/compute-type=hyperpod`. 
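Both fixes come down to a single annotation each. A sketch that prints the two commands (node and pod names are placeholders; apply only after customer approval):

```shell
NODE="hyperpod-i-0123456789abcdef0"   # placeholder EKS node name
POD="my-training-pod"                 # placeholder workload pod name

# CAS uses a node-level annotation; Karpenter disruption protection is pod-level
FIXES="kubectl annotate node ${NODE} cluster-autoscaler.kubernetes.io/scale-down-disabled=true --overwrite
kubectl annotate pod ${POD} karpenter.sh/do-not-disrupt=true --overwrite"
printf '%s\n' "$FIXES"
```

For fleets, prefer setting the Karpenter annotation in the pod template rather than annotating pods one by one.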
---

## Read-only guarantee & remediation principle

The scripts in this skill never mutate cluster state and never emit remediation commands. Each issue detected points at a `references/<file>.md § <section>
`; open that section with `Read` to find the root cause, exact commands, verification, and blast radius. Cluster-level remediations (SG changes, AMI upgrade, kubeconfig overwrite, node replacement) have wide blast radius — always explain the impact and wait for explicit customer approval before running anything. + +## Prerequisites + +Required on the machine running the skill: + +- `aws` CLI v2.13+ — authenticated to the AWS account that owns the HyperPod cluster. +- `jq` — used for JSON parsing in `--validate` mode and add-on parsing. +- `python3` — used for safe JSON manipulation and SSM payload building. +- `bash` 4.2+. + +Required for EKS cluster checks: + +- `kubectl` — authenticated to the EKS cluster behind the HyperPod cluster. If absent, EKS-specific checks (access entries, add-ons, aws-auth) are skipped. +- `eks:DescribeCluster`, `eks:ListAccessEntries`, `eks:ListAddons`, `eks:DescribeAddon` in the caller's IAM. + +Required for Slurm controller health (SSM-based): + +- `session-manager-plugin`. The controller's instance role must include `AmazonSSMManagedInstanceCore`. + +See [references/cluster-operations.md § 4 EKS Access Control](references/cluster-operations.md) and [§ 6 SSM Target Format](references/cluster-operations.md) for setup. + +## Defaults + +- **Region**: reads `$AWS_DEFAULT_REGION`; if unset, `us-east-1`. +- **Mode**: diagnose an existing cluster (`--cluster `). Use `--validate` for pre-create checks on SGs / subnets / IAM. +- **Output colors**: ANSI colors on; `--no-color` disables. +- **Event window**: `list-cluster-events --max-results 20` (most recent). For long provisioning incidents, cross-check CloudWatch log streams (§ 7 in the script output). +- **Node list pagination**: paginated via `--no-paginate` / `NextToken` up to 5000 nodes. +- **SSM command timeout**: 180 seconds per controller probe with retries for throttling. +- **Read-only**: the script NEVER modifies cluster state and NEVER prints remediation commands. 
+ +## Error Handling + +| Failure mode | Script behavior | What to tell the customer | +| --------------------------------------------------------- | --------------------------------------------------------------------------------- | ---------------------------------------------------------------------- | +| `aws sts get-caller-identity` fails | Exit 1 | "Fix AWS credentials and rerun." | +| Cluster not found | Exit 1 after listing clusters in the region | "Confirm the HyperPod cluster name (not the EKS name) and region." | +| `sagemaker:*` / `ec2:*` / `eks:*` / `logs:*` AccessDenied | Warn, add `Missing IAM permission for `, continue with partial data | "Grant the listed IAM action and rerun." | +| `kubectl` absent / not authenticated | Skip EKS-specific checks (access entries, add-ons, aws-auth, node reconciliation) | "Install/authenticate kubectl; § D in references." | +| SSM plugin absent (Slurm cluster) | Skip Slurm controller probe | "Install session-manager-plugin; § F in references." | +| SSM `send-command` throttled | Retry with backoff; if still throttled, warn and continue | "Rerun later — script is idempotent." | +| SSM command times out (180s) on large Slurm fleets | Return partial output, note in summary | "Rerun during a quiet window or reduce sinfo scope." | +| CloudWatch log group not found | Skip CloudWatch check, continue | "CloudWatch not configured on this cluster; see § 5 in operations.md." | + +Exit codes: `0` = diagnostic complete (issues may still exist — check output); `1` = cluster not found / fatal prerequisite missing / critical failures present in `--validate` mode. + +## IAM permissions required + +See [references/iam-permissions.md](references/iam-permissions.md) for the full IAM policy. 
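The exit codes translate into a simple gate for automation; a self-contained sketch using a stand-in function instead of the real script:

```shell
# Stand-in for scripts/diagnose-cluster.sh (the real script exits 0 or 1 as documented above)
run_diag() { return 1; }   # simulate a fatal prerequisite failure

if run_diag; then
  NEXT="review the [FAIL] pointers"   # exit 0: run completed, but issues may still exist
else
  NEXT="fix prerequisites / cluster name, then rerun"
fi
echo "$NEXT"
```

Note that exit 0 does not mean the cluster is healthy; the output still has to be read.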
+ +## Skill delegation + +| Need | Use | +| ----------------------------------------------------------------- | ------------------------------------------------------ | +| Per-node runtime issues (GPU, disk, OOM, Slurm) | `hyperpod-node-debugger` skill | +| SSM failure on a single node | `hyperpod-node-debugger` § K | +| Cluster-wide SSM outage (all nodes unreachable) | stay here — § F | +| Single-node EFA health-check failure post-provisioning | `hyperpod-node-debugger` § A | +| Cluster-wide EFA health-check failure at creation time | stay here — § A | +| Cluster creation / deployment failures (CFN, capacity, lifecycle) | stay here — run `--validate` first, then `§ H / B / C` | +| NCCL timeout / distributed training errors | `hyperpod-nccl` skill | +| Shell access to nodes | `hyperpod-ssm` skill | +| Software version comparison across nodes | `hyperpod-version-checker` skill | +| Diagnostic bundle for AWS Support | `hyperpod-issue-report` skill | +| Training performance / MFU degradation | `hyperpod-mfu-debugger` skill | + +## Escalate to AWS Support when + +1. EFA health checks fail despite all SG rules being correct. +2. Capacity errors persist despite a valid Flexible Training Plan / ODCR. +3. Node replacement keeps failing with no clear error in events or logs. +4. Cluster stuck in a non-terminal state (Creating/Updating) for an extended period. +5. CloudFormation root cause error is an internal service error. + +Collect diagnostics with `scripts/diagnose-cluster.sh` and `hyperpod-issue-report` before escalating. 
diff --git a/plugins/sagemaker-ai/skills/hyperpod-cluster-debugger/references/capacity-planning.md b/plugins/sagemaker-ai/skills/hyperpod-cluster-debugger/references/capacity-planning.md new file mode 100644 index 00000000..a1a8bf47 --- /dev/null +++ b/plugins/sagemaker-ai/skills/hyperpod-cluster-debugger/references/capacity-planning.md @@ -0,0 +1,238 @@ +# Capacity Planning for HyperPod Clusters + +Deep-dive companion to the main [SKILL.md](../SKILL.md) § B (Capacity & AZ) and the `--validate` pre-create mode. Capacity errors are one of the most common cluster-creation failures. This reference covers how to choose the right capacity strategy, verify availability, and resolve capacity-related failures. + +--- + +## Capacity Acquisition Options + +### 1. On-Demand Instances + +**Best for:** Small instance types, short-term experiments, development clusters. + +- No upfront commitment +- Available immediately for common types (g5, p3) +- **Not guaranteed** for large GPU types (p4d, p5, p5e, trn1, trn2) +- Instances may not be allocated in physical proximity → suboptimal network topology for distributed training +- Higher hourly cost + +```bash +# Check where an instance type is available: +aws ec2 describe-instance-type-offerings \ + --location-type availability-zone \ + --filters "Name=instance-type,Values=ml.p5.48xlarge" \ + --region us-west-2 \ + --query 'InstanceTypeOfferings[*].Location' --output table +``` + +### 2. Flexible Training Plans + +**Best for:** Medium to large workloads with predictable schedules. + +Query available capacity by instance type, count, and desired schedule. AWS returns available options with pricing. 
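To see what schedules AWS can actually offer before committing, the `SearchTrainingPlanOfferings` API can be queried. A printed sketch (parameter names are my best reading of the API and worth confirming with `aws sagemaker search-training-plan-offerings help`):

```shell
# Printed, not executed; values are placeholders for illustration
QUERY="aws sagemaker search-training-plan-offerings \
  --instance-type ml.p5.48xlarge \
  --instance-count 4 \
  --target-resources hyperpod-cluster \
  --region us-west-2"
printf '%s\n' "$QUERY"
```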
+ +```bash +# List active training plans: +aws sagemaker list-training-plans \ + --filters Name=Status,Value=Active \ + --region \ + --query 'TrainingPlanSummaries[*].{Name:TrainingPlanName,Type:InstanceType,Count:TotalInstanceCount,AZ:AvailabilityZone,Status:Status,Start:StartTime,End:EndTime}' \ + --output table +``` + +**Using with HyperPod:** + +```bash +aws sagemaker create-cluster \ + --cluster-name my-cluster \ + --instance-groups '[{ + "InstanceGroupName": "gpu-workers", + "InstanceType": "ml.p5.48xlarge", + "InstanceCount": 4, + "ExecutionRole": "arn:aws:iam:::role/HyperPodRole", + "TrainingPlanArn": "arn:aws:sagemaker:::training-plan/", + "LifeCycleConfig": { + "SourceS3Uri": "s3://sagemaker-lifecycle-/", + "OnCreate": "on_create.sh" + } + }]' \ + --vpc-config '{"SecurityGroupIds":["sg-xxx"],"Subnets":["subnet-xxx"]}' \ + --region +``` + +**Critical:** The subnet must be in the **same AZ** as the training plan's `AvailabilityZone`. + +**Training Plan Status Values:** `Pending`, `Active`, `Scheduled`, `Expired`, `Failed` + +**Advantages:** + +- Guaranteed capacity for reserved period +- Discounted pricing vs on-demand +- Better network topology (co-located instances) + +**Disadvantages:** + +- Requires advance planning and commitment +- Capacity locked to specific AZ + +### 3. Reserved Capacity (ODCR via AWS Account Team) + +**Best for:** Large-scale, long-term capacity needs (months+). 
+ +- Contact your AWS account team or TAM +- Best pricing for sustained usage +- Guaranteed placement in specific AZ +- Requires longer lead time + +**Verification:** + +```bash +# Check reserved capacity details: +aws sagemaker list-training-plans \ + --region \ + --query 'TrainingPlanSummaries[?ReservedCapacitySummaries]' +``` + +**ReservedCapacitySummary fields:** + +- `ReservedCapacityArn`, `ReservedCapacityType` (UltraServer or Instance) +- `InstanceType`, `TotalInstanceCount`, `AvailabilityZone` +- `DurationHours`, `DurationMinutes`, `StartTime`, `EndTime`, `Status` + +--- + +## AZ Selection Strategy + +### The Problem + +Instance type availability varies by AZ. A subnet in `us-west-2a` may have capacity, while `us-west-2c` does not. Worse, AZ names (e.g., `us-west-2a`) map to different physical zones per AWS account. + +### Use AZ IDs for Consistency + +AZ IDs (e.g., `usw2-az1`) are consistent across accounts: + +```bash +# Map AZ names to IDs: +aws ec2 describe-availability-zones --region \ + --query 'AvailabilityZones[*].{Name:ZoneName,ID:ZoneId,State:State}' --output table +``` + +When coordinating with AWS Support or account teams about reserved capacity, always use **AZ IDs** (not names). + +### Verify Subnet Matches Capacity AZ + +```bash +# Your subnet's AZ: +aws ec2 describe-subnets --subnet-ids --region \ + --query 'Subnets[0].{AZ:AvailabilityZone,AZ_ID:AvailabilityZoneId}' + +# Instance type availability per AZ: +aws ec2 describe-instance-type-offerings \ + --location-type availability-zone-id \ + --filters "Name=instance-type,Values=" \ + --region \ + --query 'InstanceTypeOfferings[*].Location' +``` + +If your subnet's AZ doesn't appear in the instance type offerings list, create a new subnet in an AZ that does. 
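Combining the two outputs above is a one-line membership check; a sketch with hypothetical AZ IDs:

```shell
# Values as returned by the two commands above (hypothetical)
SUBNET_AZ_ID="usw2-az2"               # from describe-subnets
OFFERED_AZ_IDS="usw2-az1 usw2-az3"    # from describe-instance-type-offerings

# Whole-word membership test via padded-string matching
case " ${OFFERED_AZ_IDS} " in
  *" ${SUBNET_AZ_ID} "*) RESULT="ok: subnet AZ offers the instance type" ;;
  *)                     RESULT="mismatch: create a subnet in one of: ${OFFERED_AZ_IDS}" ;;
esac
echo "$RESULT"
```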
+ +--- + +## Subnet IP Capacity + +GPU instances consume many network interfaces (and IPs) per instance: + +| Instance Type | ENIs | IPs per ENI | Total IPs (Slurm) | Total IPs (EKS) | +| ---------------- | ---- | ------------------------ | ----------------- | --------------- | +| ml.p5.48xlarge | 32 | 1 primary + 49 secondary | ~32 | ~81 | +| ml.p5e.48xlarge | 32 | same | ~32 | ~81 | +| ml.p4d.24xlarge | 4 | 1 primary + 49 secondary | ~4 | ~51 | +| ml.p4de.24xlarge | 4 | same | ~4 | ~51 | +| ml.trn1.32xlarge | 8 | 1 primary + 49 secondary | ~8 | ~57 | +| ml.trn2.48xlarge | 16 | same | ~16 | ~65 | +| ml.g5.48xlarge | 2 | 1 primary + 14 secondary | ~2 | ~15 | + +### Calculate Required IPs + +``` +Required IPs = Instance Count × IPs per Instance +``` + +For example: 16 × ml.p5.48xlarge on EKS = 16 × 81 = 1,296 IPs → requires at least a /21 subnet (2,048 IPs). + +### Recommended Subnet Sizes + +| Cluster Size (p5) | Orchestrator | Min Subnet CIDR | +| ----------------- | ------------ | ---------------------------- | +| 4 instances | Slurm | /25 (128 IPs) | +| 4 instances | EKS | /24 (256 IPs, plus overhead) | +| 16 instances | Slurm | /23 (512 IPs) | +| 16 instances | EKS | /21 (2,048 IPs) | +| 64 instances | Slurm | /21 (2,048 IPs) | +| 64 instances | EKS | /19 (8,192 IPs) | + +**Subnet CIDRs cannot be changed after creation.** Plan for growth. 
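The sizing rule above can be computed directly. A sketch for 16 × ml.p5.48xlarge on EKS, assuming the 5 AWS-reserved addresses per subnet:

```shell
INSTANCES=16
IPS_PER_INSTANCE=81   # ml.p5.48xlarge on EKS, per the table above
REQUIRED=$((INSTANCES * IPS_PER_INSTANCE))

# Shrink the prefix until the subnet's usable space covers the requirement
PREFIX=28
while [ $(( (1 << (32 - PREFIX)) - 5 )) -lt "$REQUIRED" ] && [ "$PREFIX" -gt 16 ]; do
  PREFIX=$((PREFIX - 1))
done
echo "Need ${REQUIRED} IPs -> at least a /${PREFIX} subnet"
```

This reproduces the worked example above (1,296 IPs, at least a /21), before accounting for other tenants of the subnet.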
+ +```bash +# Check current availability: +aws ec2 describe-subnets --subnet-ids --region \ + --query 'Subnets[0].{CIDR:CidrBlock,TotalIPs:CidrBlock,FreeIPs:AvailableIpAddressCount}' +``` + +--- + +## Service Quotas + +Check these **before** creating a cluster: + +```bash +# List SageMaker quotas (search for "cluster"): +aws service-quotas list-service-quotas \ + --service-code sagemaker --region \ + --query 'Quotas[?contains(QuotaName,`cluster`) || contains(QuotaName,`Cluster`)].{Name:QuotaName,Value:Value,Code:QuotaCode}' \ + --output table +``` + +| Quota | Default | What Happens If Exceeded | +| -------------------------------- | ---------------- | --------------------------------------- | +| `ml. for cluster usage` | Varies | `CreateCluster` fails with quota error | +| Max instances per cluster | Account-specific | Cannot add more instance groups | +| Total instances across clusters | Account-specific | Must delete existing clusters first | +| Max EBS volume size per instance | 16,384 GB | `CreateCluster` fails if config exceeds | +| VPCs per region | 5 | CFN VPC creation fails | +| Network interfaces per region | 5,000 | Instance provisioning fails silently | +| Elastic IPs per region | 5 | NAT Gateway creation fails | + +**Request quota increases proactively** — increases can take 1-3 business days. + +--- + +## Troubleshooting Capacity Failures + +### "Insufficient capacity" Error + +1. Check which AZs have the instance type available (see commands above) +2. Verify your subnet is in one of those AZs +3. If no AZ has capacity: try a different region, instance type, or contact account team +4. If using Training Plan: verify `TrainingPlanArn` and subnet AZ match + +### "No subnets in the capacity AZ" Error + +The cluster configuration specifies subnets, but none of them are in the AZ where AWS has capacity. + +Fix: Create a new subnet in the AZ where capacity exists and add it to the cluster configuration. 
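The fix is a single `create-subnet` call pinned to the AZ ID where capacity exists; printed here with placeholder VPC/CIDR values for review before running:

```shell
# Placeholder VPC ID and CIDR; size the CIDR per the Subnet IP Capacity tables above
NEW_SUBNET="aws ec2 create-subnet --vpc-id vpc-0123456789abcdef0 \
  --availability-zone-id usw2-az2 --cidr-block 10.1.8.0/21 --region us-west-2"
printf '%s\n' "$NEW_SUBNET"
```

Using `--availability-zone-id` (rather than the AZ name) keeps the placement consistent with what the account team reserved.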
+
+### Cluster Stuck in "Creating" (No Progress)
+
+1. Check `list-cluster-events` for error messages
+2. If no events: likely waiting for capacity
+3. If events show failures: fix the indicated issue
+4. If stuck >1 hour with no events: contact AWS Support
+
+### Partial Provisioning (Some Nodes Running, Others Failing)
+
+This typically means capacity was available for some instances but not all.
+
+- The cluster will keep retrying if `NodeProvisioningMode=Continuous`
+- Check events for the specific instance group that's failing
+- Consider reducing `InstanceCount` or using `MinInstanceCount` for elastic scaling
diff --git a/plugins/sagemaker-ai/skills/hyperpod-cluster-debugger/references/cloudformation-errors.md b/plugins/sagemaker-ai/skills/hyperpod-cluster-debugger/references/cloudformation-errors.md
new file mode 100644
index 00000000..4be45465
--- /dev/null
+++ b/plugins/sagemaker-ai/skills/hyperpod-cluster-debugger/references/cloudformation-errors.md
@@ -0,0 +1,148 @@
+# CloudFormation Error Reference for HyperPod Deployments
+
+Deep-dive companion to the main [SKILL.md](../SKILL.md) § H (CloudFormation Errors). When deploying HyperPod via the SageMaker console or CloudFormation templates, failures surface as `CREATE_FAILED` or `ROLLBACK_COMPLETE` at the top-level stack. The actual root cause is usually buried several levels deep in nested stacks.
+
+---
+
+## Navigating Nested Stacks
+
+### Stack Hierarchy (Console Deployments)
+
+A typical HyperPod console deployment creates this stack structure:
+
+```
+Top-Level Stack (HyperPod-<stack-name>)
+├── NetworkStack (VPC, subnets, IGW, NAT, SG, S3 endpoint)
+├── StorageStack (FSx Lustre, optional OpenZFS)
+├── IAMStack (execution role, instance profile)
+├── S3Stack (lifecycle scripts bucket + upload)
+└── ClusterStack (AWS::SageMaker::Cluster resource)
+    └── [The cluster resource itself — most failures end here]
+```
+
+### Step-by-Step Navigation
+
+1. **CloudFormation Console** → ensure correct region → find the HyperPod stack
+2. **Status filter:** look for `CREATE_FAILED` or `ROLLBACK_COMPLETE`
+3. **Events tab** → filter by `CREATE_FAILED` → note the earliest failure timestamp
+4. **Resources tab** → find `AWS::CloudFormation::Stack` type entries with `CREATE_FAILED`
+5. **Click Physical ID** of the failed nested stack
+6. **Repeat** until reaching a stack with only leaf resources (no further `AWS::CloudFormation::Stack`)
+7. **Read Status Reason** on the failed leaf resource — this is the root cause
+
+### Tip: Find Root Cause via CLI
+
+```bash
+# List failed events for a stack (pass its name or ID):
+aws cloudformation describe-stack-events \
+  --stack-name <stack-name> \
+  --region <region> \
+  --query 'StackEvents[?ResourceStatus==`CREATE_FAILED`].{Time:Timestamp,Resource:LogicalResourceId,Type:ResourceType,Reason:ResourceStatusReason}' \
+  --output table
+
+# For nested stacks — get the nested stack's name from the Resources tab:
+aws cloudformation describe-stack-events \
+  --stack-name <nested-stack-name> \
+  --region <region> \
+  --query 'StackEvents[?ResourceStatus==`CREATE_FAILED`]'
+```
+
+---
+
+## Resource Error Catalog
+
+### AWS::SageMaker::Cluster
+
+| Status Reason | Root Cause | Fix |
+| ----------------------------------------------------------- | -------------------------------------------------- | ---------------------------------------------------------------------------- |
+| `Insufficient capacity in the Availability Zone` | No on-demand instances available in AZ | Add subnet in different AZ; use Flexible Training Plans or reserved capacity |
+| `No subnets in the capacity AZ` | Cluster subnet not in the AZ where capacity exists | Create subnet in the AZ where instances are available |
+| `EFA health checks did not run successfully` | Security group missing self-referencing rules | Add inbound + outbound self-ref rules on SG (protocol: All, source: self) |
+| `Lifecycle scripts did not run successfully` | Script error, S3 access, or timeout | Check CloudWatch logs: `/aws/sagemaker/Clusters/<cluster-name>/<cluster-id>` |
+| `Instance bootstrap failed due to network misconfiguration` | VPC routing or SG issue | Verify NAT Gateway route, S3 VPC endpoint, SG rules |
+| `The security group 'sg-xxx' does not exist` | SG ID is wrong or in different region | Verify SG exists in the same region and VPC |
+| `The subnet 'subnet-xxx' does not exist` | Subnet ID is wrong or in different region | Verify subnet exists in the same region |
+| `You are not authorized to perform this operation` | Execution role missing permissions | Add `AmazonSageMakerClusterInstanceRolePolicy` + VPC permissions |
+| `The maximum number of instances ... has been reached` | Service quota exceeded | Request quota increase via Service Quotas console |
+
+### AWS::IAM::Role
+
+| Status Reason | Root Cause | Fix |
+| ----------------------------------------- | ------------------------------------- | ---------------------------------------------------------- |
+| `Cannot exceed quota for PoliciesPerRole` | Too many managed policies attached | Consolidate inline policies; limit is 10 managed per role |
+| `Invalid principal in policy` | Trust policy references wrong service | Use `"Service": "sagemaker.amazonaws.com"` in trust policy |
+| `MalformedPolicyDocument` | JSON syntax error in inline policy | Validate JSON; check for trailing commas, missing quotes |
+| `EntityAlreadyExists` | Role name already taken | Use unique name or import existing role |
+
+### AWS::EC2::VPC / Subnet / SecurityGroup
+
+| Status Reason | Root Cause | Fix |
+| ---------------------------------------------------- | ----------------------------------------- | ------------------------------------------------------------ |
+| `The CIDR 'x.x.x.x/y' conflicts with another subnet` | Overlapping CIDR in same VPC | Use non-overlapping CIDR blocks |
+| `The maximum number of VPCs has been reached` | VPC quota per region (default: 5) | Request VPC quota increase |
+| `InvalidGroup.Duplicate` | SG rule already exists | Skip — not a real error (idempotency issue in template) |
+| `RulesPerSecurityGroupLimitExceeded` | More than 60 inbound or 60 outbound rules | Consolidate rules; use CIDR ranges instead of individual IPs |
+
+### AWS::FSx::FileSystem
+
+| Status Reason | Root Cause | Fix |
+| ----------------------------------------------- | --------------------------------------- | ---------------------------------------------------- |
+| `The subnet is not in a supported AZ` | FSx Lustre not available in subnet's AZ | Use a subnet in an AZ that supports FSx Lustre |
+| `The security group does not belong to the VPC` | SG and subnet in different VPCs | Move SG or subnet to same VPC |
+| `Insufficient storage capacity` | FSx Lustre capacity exhausted in AZ | Try different AZ or reduce storage size |
+| `Invalid deployment type for storage type` | Template uses incompatible FSx config | PERSISTENT_2 requires SSD; check template parameters |
+
+### AWS::Lambda::Function (Custom Resources)
+
+| Status Reason | Root Cause | Fix |
+| ------------------------------------------------ | ------------------------------------ | --------------------------------------------------------- |
+| `<opaque message>` (Custom::Resource) | Lambda-backed custom resource failed | Find the Lambda function name → check its CloudWatch logs |
+| `Timed out` | Lambda exceeded 15-minute limit | Custom resource handler is too slow; check what it does |
+
+**To debug Custom::Resource failures:**
+
+```bash
+# Find the Lambda function name in the CFN Resources tab, then:
+aws logs tail /aws/lambda/<function-name> --region <region> --since 1h
+```
+
+---
+
+## Rolled-Back Stacks
+
+When a stack rolls back, CloudFormation deletes the resources it created. To view rolled-back stacks:
+
+1. CloudFormation Console → **Deleted** filter (top-right dropdown)
+2. Or via CLI:
+
+   ```bash
+   aws cloudformation list-stacks \
+     --stack-status-filter ROLLBACK_COMPLETE DELETE_COMPLETE \
+     --region <region> \
+     --query 'StackSummaries[?contains(StackName,`HyperPod`) || contains(StackName,`hyperpod`)].{Name:StackName,Status:StackStatus,Time:CreationTime}' \
+     --output table
+   ```
+
+---
+
+## CFN Template Gotchas
+
+### ThreadsPerCore
+
+`ThreadsPerCore` defaults to 1 (hyperthreading disabled) when set via the console "Advanced Configuration." This makes p5.48xlarge show 96 vCPU instead of 192. Fix: set `ThreadsPerCore: 2` explicitly.
+
+Any `UpdateCluster` call via CFN **must include ThreadsPerCore** even if not originally set — omitting it resets to the default.
+
+### S3 Bucket Naming
+
+The `SourceS3Uri` must match the pattern `s3://sagemaker-*` per API validation. CFN templates typically create a bucket named `sagemaker-lifecycle-<unique-suffix>`.
+
+### Condition-Dependent Resources
+
+If using the reference HyperPod CFN template, some resources are conditional:
+
+- FSx OpenZFS: only created if `CreateOpenZFS=true`
+- S3 VPC Endpoint: only created if `CreateS3Endpoint=true`
+- SSM Session Document: only if `CreateSSMSessionDocument=true`
+
+A condition evaluating to `false` means the resource is skipped (not failed).
diff --git a/plugins/sagemaker-ai/skills/hyperpod-cluster-debugger/references/cluster-diagnostics-detail.md b/plugins/sagemaker-ai/skills/hyperpod-cluster-debugger/references/cluster-diagnostics-detail.md
new file mode 100644
index 00000000..1c613dd4
--- /dev/null
+++ b/plugins/sagemaker-ai/skills/hyperpod-cluster-debugger/references/cluster-diagnostics-detail.md
@@ -0,0 +1,664 @@
+# Cluster Diagnostics — Detailed Procedures
+
+This file contains the full diagnostic and fix procedures for each section referenced
+in the main [SKILL.md](../SKILL.md). Jump to a section using the anchors below.
+
+---
+
+## A: EFA Health Checks
+
+**Signals:** `"EFA health checks did not run successfully. Ensure that your VPC and security groups are properly configured before attempting to create a new cluster."`
+
+**Root cause:** The security group is missing a self-referencing rule that allows nodes to communicate with each other via EFA. This is the most common cluster creation failure.
+
+### Diagnose
+
+```bash
+# The diagnostic script auto-checks SG rules. You can also run directly:
+bash scripts/diagnose-cluster.sh --cluster <cluster-name> --region <region>
+
+# Or check a specific security group:
+SG=$(aws sagemaker describe-cluster --cluster-name <cluster-name> --region <region> \
+  --query 'VpcConfig.SecurityGroupIds[0]' --output text)
+
+aws ec2 describe-security-groups --group-ids $SG --region <region> \
+  --query 'SecurityGroups[0].{Inbound:IpPermissions,Outbound:IpPermissionsEgress}' \
+  --output json
+```
+
+Look for: self-referencing rules where the Source/Destination is the security group itself.
+
+### Fix
+
+Add the required rules to **every** security group used by the cluster:
+
+```bash
+SG=<sg-id>
+REGION=<region>
+
+# Rule 1 — Inbound self-reference (required for inter-node communication)
+aws ec2 authorize-security-group-ingress --group-id $SG --region $REGION \
+  --ip-permissions '[{"IpProtocol":"-1","UserIdGroupPairs":[{"GroupId":"'"$SG"'"}]}]'
+
+# Rule 2 — Outbound self-reference (required for EFA RDMA traffic)
+aws ec2 authorize-security-group-egress --group-id $SG --region $REGION \
+  --ip-permissions '[{"IpProtocol":"-1","UserIdGroupPairs":[{"GroupId":"'"$SG"'"}]}]'
+
+# Rule 3 — Outbound internet (required for AWS API calls, package downloads)
+aws ec2 authorize-security-group-egress --group-id $SG --region $REGION \
+  --ip-permissions '[{"IpProtocol":"-1","IpRanges":[{"CidrIp":"0.0.0.0/0"}]}]'
+```
+
+After fixing: verify with `describe-security-groups`, ensure all nodes use the same SG, then **retry cluster creation**. See [cluster-operations.md](cluster-operations.md) § 1 for multi-SG clusters and verification details.
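Eyeballing the `describe-security-groups` JSON for a self-referencing rule is error-prone. A hedged sketch of a local check (the `check_self_ref` helper name is illustrative; it only parses JSON, so it can be fed the saved output of the command above):

```shell
# Pipe `aws ec2 describe-security-groups --group-ids <sg-id> --output json`
# into this function; it reports whether any inbound or outbound rule
# references the security group itself.
check_self_ref() {  # usage: ... | check_self_ref <sg-id>
  python3 -c '
import json, sys
sg_id = sys.argv[1]
sg = json.load(sys.stdin)["SecurityGroups"][0]
perms = sg.get("IpPermissions", []) + sg.get("IpPermissionsEgress", [])
pairs = [p for perm in perms for p in perm.get("UserIdGroupPairs", [])]
print("self-ref OK" if any(p.get("GroupId") == sg_id for p in pairs) else "self-ref MISSING")
' "$1"
}
```

`self-ref MISSING` on either direction means the Rule 1 / Rule 2 fixes above are needed before retrying cluster creation.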
+
+---
+
+## B: Capacity & AZ
+
+**Signals:** `"Insufficient capacity"`, `"We currently do not have sufficient capacity in the Availability Zone you requested"`, `"Cannot provision requested instances"`, `"No subnets in the capacity AZ"`.
+
+### Diagnose
+
+```bash
+# Check which AZs have the instance type
+aws ec2 describe-instance-type-offerings \
+  --location-type availability-zone \
+  --filters "Name=instance-type,Values=<instance-type>" \
+  --region <region> \
+  --query 'InstanceTypeOfferings[*].Location' --output table
+```
+
+### Fix
+
+1. **Try a different AZ** — add a subnet where the instance type is available
+2. **Flexible Training Plans** (recommended for p4d/p5/trn1) — `aws sagemaker search-training-plan-offerings`, then set `TrainingPlanArn` in the cluster config
+3. **Reserved capacity** — contact your AWS account team for large or long-term needs
+
+If using reserved capacity and still failing: verify the subnet AZ matches the reservation AZ. See [cluster-operations.md](cluster-operations.md) § 2 for the condensed workflow and [capacity-planning.md](capacity-planning.md) for the full strategy guide (On-Demand vs. Flexible Training Plans vs. ODCR, AZ-ID selection, subnet IP sizing per instance type, relevant service quotas).
+
+---
+
+## C: Lifecycle Scripts
+
+**Signals:** `"Lifecycle scripts did not run successfully"`, `"Lifecycle scripts execution timed out"`, cluster creation fails during provisioning.
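Several of the lifecycle failures in this section (CRLF line endings, missing shebang, missing exec bit, shell syntax errors) are detectable locally before the scripts ever reach S3. A minimal pre-upload lint, assuming bash scripts — the `lint_lcc_script` helper is a hypothetical convenience, not part of the skill's tooling:

```shell
# Catch the most common lifecycle-script defects before uploading to S3.
lint_lcc_script() {
  local f=$1 rc=0
  # Windows line endings make the interpreter line unparseable on Linux
  if grep -q $'\r' "$f"; then echo "CRLF line endings: run dos2unix $f"; rc=1; fi
  # Without a shebang the script may run under the wrong shell (or not at all)
  if [ "$(head -c 2 "$f")" != "#!" ]; then echo "missing shebang in $f"; rc=1; fi
  # The exec bit must be set before the S3 upload
  if [ ! -x "$f" ]; then echo "not executable: chmod +x $f"; rc=1; fi
  # Parse-only check for shell syntax errors
  bash -n "$f" || { echo "syntax error in $f"; rc=1; }
  return $rc
}
```

Run it over every script referenced by `on_create.sh` before syncing the lifecycle directory to S3; a non-zero exit means at least one check failed.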
+
+### Diagnose
+
+```bash
+# Get the cluster ID for the CloudWatch log group
+CLUSTER_NAME=<cluster-name>
+CLUSTER_ARN=$(aws sagemaker describe-cluster --cluster-name $CLUSTER_NAME --region <region> \
+  --query 'ClusterArn' --output text)
+CLUSTER_ID=$(echo "$CLUSTER_ARN" | cut -d/ -f2)
+LOG_GROUP="/aws/sagemaker/Clusters/${CLUSTER_NAME}/${CLUSTER_ID}"
+
+# List lifecycle log streams
+aws logs describe-log-streams \
+  --log-group-name "$LOG_GROUP" \
+  --region <region> \
+  --query 'logStreams[?starts_with(logStreamName,`LifecycleConfig`)].logStreamName' \
+  --output table
+
+# Read a specific log stream
+aws logs get-log-events \
+  --log-group-name "$LOG_GROUP" \
+  --log-stream-name "LifecycleConfig/<instance-group>/<instance-id>" \
+  --region <region> \
+  --query 'events[*].message' --output text
+```
+
+### Common Errors & Fixes
+
+| Log Error | Root Cause | Fix |
+| ---------------------------------------- | ------------------------------ | ----------------------------------------------------------------------------------------- |
+| `Connect timeout on endpoint URL: s3://` | No S3 access from VPC | Add S3 Gateway VPC endpoint to the subnet route table |
+| `AccessDenied` on S3 | Missing IAM permissions | Add `s3:GetObject` + `s3:ListBucket` to the execution role for the lifecycle script S3 bucket |
+| Script never exits / timeout | Infinite loop or hung command | Add proper exit codes; test the script locally; add `set -e` to fail fast |
+| `ASCII text, with CRLF line terminators` | Windows line endings | Convert: `dos2unix script.sh` before uploading to S3 |
+| `provisioning_parameters.json mismatch` | Instance group name mismatch | Match instance group names exactly between the lifecycle script and the API call |
+| `command not found` | Missing dependency | Check if required packages are in the AMI; install in the script |
+| `Permission denied` | Missing shebang or permissions | Add `#!/bin/bash` as the first line; ensure `chmod +x` before S3 upload |
+
+Compare scripts with the latest upstream versions — see [cluster-operations.md](cluster-operations.md) § 3 for repo links, testing tips, and execution order. For the full S3 layout, `config.py` toggle reference, per-node-type detection flow, and on-node debug procedures (`/var/log/provision/`, `resource_config.json`), see [lifecycle-scripts.md](lifecycle-scripts.md).
+
+---
+
+## D: EKS Access / kubectl
+
+**Signals:** `"couldn't get current server API group list: the server has asked for the client to provide credentials"`, kubectl auth errors, `kubectl get nodes` returns nothing or errors.
+
+**Root cause:** IAM identity not configured in EKS access entries, or kubeconfig not set up.
+
+### Diagnose
+
+```bash
+# Step 1: Check your IAM identity
+aws sts get-caller-identity
+
+# Step 2: Get the EKS cluster name from HyperPod
+EKS_ARN=$(aws sagemaker describe-cluster --cluster-name <cluster-name> --region <region> \
+  --query 'Orchestrator.Eks.ClusterArn' --output text)
+EKS_NAME=$(echo $EKS_ARN | awk -F'/' '{print $NF}')
+echo "EKS cluster: $EKS_NAME"
+
+# Step 3: Check existing access entries
+aws eks list-access-entries --cluster-name $EKS_NAME --region <region>
+
+# Step 4: Check auth mode
+aws eks describe-cluster --name $EKS_NAME --region <region> \
+  --query 'cluster.accessConfig.authenticationMode' --output text
+# Must be API or API_AND_CONFIG_MAP — not CONFIG_MAP
+```
+
+### Fix
+
+```bash
+# Step 1: Add your IAM identity to EKS access entries
+MY_ARN=$(aws sts get-caller-identity --query 'Arn' --output text)
+
+# For IAM users:
+aws eks create-access-entry \
+  --cluster-name $EKS_NAME \
+  --region <region> \
+  --principal-arn $MY_ARN
+
+# Step 2: Associate admin policy (for full cluster access)
+aws eks associate-access-policy \
+  --cluster-name $EKS_NAME \
+  --region <region> \
+  --principal-arn $MY_ARN \
+  --policy-arn arn:aws:eks::aws:cluster-access-policy/AmazonEKSClusterAdminPolicy \
+  --access-scope '{"type": "cluster"}'
+
+# Step 3: Update kubeconfig
+aws eks update-kubeconfig --name $EKS_NAME --region <region>
+
+# Step 4: Test access
+kubectl get nodes
+kubectl get pods -A
+```
+
+**If auth mode is CONFIG_MAP** (not supported by HyperPod): change to `API_AND_CONFIG_MAP` via `aws eks update-cluster-config --name $EKS_NAME --access-config authenticationMode=API_AND_CONFIG_MAP`. For IAM roles, use the role ARN (not the session ARN). See [cluster-operations.md](cluster-operations.md) § 4 for auth mode details.
+
+---
+
+## E: Cluster Provisioning
+
+**Signals:** Cluster shows `InService` but instances are not visible, `kubectl get nodes` returns no nodes, `list-cluster-nodes` shows fewer nodes than expected.
+
+**Root cause:** This is often expected behavior with **Continuous Provisioning mode** (EKS only). In this mode the cluster transitions to InService before all instances are created. Instance creation happens asynchronously, and failures are reported via cluster events, not as cluster creation failures.
+
+### Diagnose
+
+```bash
+# Step 1: Check cluster status and provisioning mode
+aws sagemaker describe-cluster --cluster-name <cluster-name> --region <region> \
+  --query '{Status:ClusterStatus,Groups:InstanceGroups[*].{Name:InstanceGroupName,Count:CurrentCount,Target:InstanceCount,Status:InstanceGroupStatus}}' \
+  --output table
+
+# Step 2: Check cluster events (EKS — primary source of truth)
+aws sagemaker list-cluster-events --cluster-name <cluster-name> --region <region> \
+  --query 'ClusterEventSummaries[*].{Time:EventTime,Type:EventType,Message:Message}' \
+  --output table
+
+# Step 3: Check individual node status
+aws sagemaker list-cluster-nodes --cluster-name <cluster-name> --region <region> \
+  --query 'ClusterNodeSummaries[*].{ID:InstanceId,Group:InstanceGroupName,Status:InstanceStatus.Status}' \
+  --output table
+```
+
+### Common Scenarios
+
+| Observation | Cause | Action |
+| --------------------------------------------------------- | ----------------------------------------- | ---------------------------------------------------- |
+| CurrentCount < InstanceCount, events show provisioning | Continuous provisioning — still creating | Wait; monitor events |
+| Events show `"Insufficient capacity"` | No capacity in AZ | See **[B: Capacity & AZ](#b-capacity--az)** |
+| Events show lifecycle script failure | Script error during instance provisioning | See **[C: Lifecycle Scripts](#c-lifecycle-scripts)** |
+| Events show `"EFA health checks"` | SG misconfiguration | See **[A: EFA Health Checks](#a-efa-health-checks)** |
+| No events, no nodes | Cluster may be stuck | Check CloudFormation stack; contact Support |
+| Nodes in `list-cluster-nodes` but not `kubectl get nodes` | EKS registration issue | Check lifecycle script logs, kubelet status via SSM |
+
+See [cluster-operations.md](cluster-operations.md) § 5 for Continuous Provisioning details (EKS only).
+
+---
+
+## F: SSM Connectivity
+
+**Signals:** `"Target is not connected"`, SSM session fails to start, cannot access nodes.
+
+### Diagnose
+
+```bash
+# Step 1: Verify the SSM plugin is installed
+session-manager-plugin --version
+
+# Step 2: Get the correct target format
+# Target format: sagemaker-cluster:<cluster-id>_<instance-group>-<instance-id>
+# Do NOT use the EC2 instance ID directly!
+
+CLUSTER_INFO=$(aws sagemaker describe-cluster --cluster-name <cluster-name> --region <region>)
+CLUSTER_ID=$(echo "$CLUSTER_INFO" | python3 -c "import sys,json; print(json.load(sys.stdin)['ClusterArn'].split('/')[-1])")
+
+aws sagemaker list-cluster-nodes --cluster-name <cluster-name> --region <region> \
+  --query 'ClusterNodeSummaries[*].{ID:InstanceId,Group:InstanceGroupName,Status:InstanceStatus.Status}' \
+  --output table
+
+# Step 3: Construct the target and test
+TARGET="sagemaker-cluster:${CLUSTER_ID}_<instance-group>-<instance-id>"
+aws ssm start-session --target "$TARGET" --region <region>
+```
+
+### Required IAM Permissions for SSM
+
+```json
+{
+  "Version": "2012-10-17",
+  "Statement": [{
+    "Effect": "Allow",
+    "Action": [
+      "sagemaker:DescribeCluster",
+      "sagemaker:ListClusterNodes",
+      "ssm:StartSession",
+      "ssm:TerminateSession"
+    ],
+    "Resource": "*"
+  }]
+}
+```
+
+### Common Errors & Fixes
+
+| Error | Root Cause | Fix |
+| --------------------------------------- | ------------------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------- |
+| `SessionManagerPlugin is not found` | SSM plugin not installed | Install: `brew install --cask session-manager-plugin` (macOS) or download from AWS docs (Linux). Verify: `session-manager-plugin --version` |
+| `Target is not connected` | Wrong target format, wrong region, or node not running | Use the `sagemaker-cluster:` prefix (CLUSTER_ID is the ARN suffix, not the cluster name); verify the region; check the node is `Running` |
+| `InvalidTarget` / `ValidationException` | Malformed target string | Format must be `sagemaker-cluster:<cluster-id>_<instance-group>-<instance-id>` exactly |
+| `Access denied` | Missing IAM permissions | Need `ssm:StartSession`, `sagemaker:DescribeCluster`, `sagemaker:ListClusterNodes` — see the IAM policy above |
+| Connection timeout | SSM agent unreachable | Check VPC endpoints (SSM, SSMMessages, EC2Messages) exist in the cluster VPC; verify the node is `Running` |
+
+SSM access is **identical for both EKS and Slurm** clusters — same target format, same plugin, same IAM permissions, same VPC endpoints.
+
+For SSH-over-SSM setup, see [cluster-operations.md](cluster-operations.md) § 6.
+
+---
+
+## G: Node Replacement
+
+**Signals:** Auto-replacement not triggering, `batch-replace-cluster-nodes` not working, node stuck in unhealthy state.
+
+### G.1: Auto-Replacement Not Working
+
+```bash
+# Step 1: Check if NodeRecovery is enabled per instance group
+aws sagemaker describe-cluster --cluster-name <cluster-name> --region <region> \
+  --query 'InstanceGroups[*].{Group:InstanceGroupName,Recovery:NodeRecovery}' --output table
+
+# Step 1a: If NodeRecovery=None, enable it with update-cluster. All required fields
+# for each instance group must be supplied (InstanceType/Count/LifeCycleConfig/ExecutionRole) —
+# derive them from describe-cluster output first.
+aws sagemaker update-cluster --cluster-name <cluster-name> --region <region> \
+  --instance-groups '[{"InstanceGroupName":"<group-name>","InstanceType":"ml.p5.48xlarge",
+    "InstanceCount":<count>,"ThreadsPerCore":2,
+    "LifeCycleConfig":{"SourceS3Uri":"<s3-uri>","OnCreate":"