#!/bin/bash
#
# AI Performance Engineering Setup Script
# ========================================
#
# This script installs EVERYTHING you need:
# 1. NVIDIA Driver 580.126.09 (auto-upgrades if needed; open kernel modules for B200)
# 2. Python 3.12 (PyTorch 2.10-dev compatible)
# 3. CUDA 13.0.2 toolkit + cuBLAS 13.1.0.3 (Update 2) repository
# 4. Environment for PyTorch 2.10-dev source build with CUDA 13.0.2
# 5. NVIDIA Nsight Systems 2025.3.2 (for timeline profiling)
# 6. NVIDIA Nsight Compute 2025.3.1 (for kernel metrics)
# 7. All Python dependencies from requirements_latest.txt
# 8. System tools (numactl, perf, htop, etc.)
# 9. NVIDIA driver configuration for profiling
#
# Requirements:
# - Ubuntu 22.04+ (tested on 22.04)
# - NVIDIA B200/B300 GPU (or compatible)
# - sudo/root access
# - Internet connection
#
# Usage:
# sudo ./setup.sh
# (logs to ./setup.log by default; override with SETUP_LOG_FILE)
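#   Example overrides (variables documented below; paths are illustrative):
#   sudo SETUP_LOG_FILE=/var/log/aisp-setup.log ./setup.sh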
#
# Duration: 10-20 minutes (first run may require reboot for driver upgrade)
#
# What it does:
# - Adds official NVIDIA CUDA 13.0 (Update 2) repository
# - Configures APT to prefer official NVIDIA packages
# - Fixes Python APT module (python3-apt) compatibility
# - Disables problematic command-not-found APT hook
# - Removes duplicate deadsnakes repository entries
# - Upgrades Python to 3.12 (required by PyTorch 2.10 dev builds)
# - Auto-upgrades NVIDIA driver to 580+ if needed (will prompt reboot)
# - Installs CUDA 13.0.2 toolkit and libraries
# - Installs latest Nsight tools (2025.x)
# - Prepares for PyTorch 2.10-dev (source build) with CUDA 13.0.2
# - Removes conflicting system packages (python3-optree, etc.)
# - Installs nvidia-ml-py (replaces deprecated pynvml)
# - Configures NVIDIA kernel modules for profiling
# - Fixes hardware info script compatibility
# - Runs validation tests
#
# Notes:
# - If driver upgrade is needed, script will exit and ask you to reboot
# - After reboot, simply re-run: sudo ./setup.sh
# - The script is idempotent and safe to re-run
# - Disk cleanup is enabled by default (CLEAN_APT_CACHE/CLEAN_PIP_CACHE/CLEAN_BUILD_ARTIFACTS)
# - Minimum free space thresholds: SETUP_MIN_FREE_GB / SETUP_MIN_TE_BUILD_FREE_GB
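#   (e.g. sudo CLEAN_APT_CACHE=0 CLEAN_PIP_CACHE=0 SETUP_MIN_FREE_GB=12 ./setup.sh
#   keeps caches and demands more free headroom)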
#
# After running this script, you can:
# - Run examples: python3 ch01/performance_basics.py
# - Drive the benchmark suite: python -m cli.aisp bench run
# - Capture peak performance: python core/benchmark/benchmark_peak.py
#
set -e # Exit on any error
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="${SCRIPT_DIR}"
# =============================================================================
# Logging + Disk Hygiene (always-on logging, keep disk usage tidy)
# =============================================================================
LOG_FILE="${SETUP_LOG_FILE:-${PROJECT_ROOT}/setup.log}"
LOG_MAX_BYTES="${SETUP_LOG_MAX_BYTES:-104857600}" # 100 MB
LOG_COMPRESS="${SETUP_LOG_COMPRESS:-1}"
SETUP_MIN_FREE_GB="${SETUP_MIN_FREE_GB:-6}"
SETUP_MIN_TE_BUILD_FREE_GB="${SETUP_MIN_TE_BUILD_FREE_GB:-8}"
CLEAN_APT_CACHE="${CLEAN_APT_CACHE:-1}"
CLEAN_PIP_CACHE="${CLEAN_PIP_CACHE:-1}"
CLEAN_BUILD_ARTIFACTS="${CLEAN_BUILD_ARTIFACTS:-1}"
rotate_setup_log() {
    if [ -f "${LOG_FILE}" ]; then
        local log_size
        log_size="$(stat -c %s "${LOG_FILE}" 2>/dev/null || echo 0)"
        if [ "${log_size}" -ge "${LOG_MAX_BYTES}" ]; then
            local ts rotated
            ts="$(date +%Y%m%d_%H%M%S)"
            rotated="${LOG_FILE}.${ts}"
            mv "${LOG_FILE}" "${rotated}" 2>/dev/null || true
            if [ "${LOG_COMPRESS}" -eq 1 ] && command -v gzip >/dev/null 2>&1; then
                gzip -f "${rotated}" >/dev/null 2>&1 || true
            fi
        fi
    fi
}
start_setup_logging() {
    local log_dir
    log_dir="$(dirname "${LOG_FILE}")"
    mkdir -p "${log_dir}"
    rotate_setup_log
    if [ "$(id -u)" -eq 0 ] && [ -n "${SUDO_USER:-}" ]; then
        touch "${LOG_FILE}" 2>/dev/null || true
        chown "${SUDO_USER}":"${SUDO_USER}" "${LOG_FILE}" 2>/dev/null || true
    fi
    exec > >(tee -a "${LOG_FILE}") 2>&1
    echo "Logging to ${LOG_FILE}"
}
get_free_kb() {
    df -Pk "${PROJECT_ROOT}" | awk 'NR==2 {print $4}'
}
reclaim_disk_space_basic() {
    echo "Reclaiming disk space (APT + pip caches)..."
    if [ "${CLEAN_APT_CACHE}" -eq 1 ] && [ "$(id -u)" -eq 0 ]; then
        apt-get clean >/dev/null 2>&1 || true
        rm -rf /var/lib/apt/lists/* >/dev/null 2>&1 || true
    fi
    if [ "${CLEAN_PIP_CACHE}" -eq 1 ]; then
        python3 -m pip cache purge >/dev/null 2>&1 || true
        rm -rf /root/.cache/pip >/dev/null 2>&1 || true
    fi
}
ensure_free_space_gb() {
    local min_gb="$1"
    local reason="$2"
    local free_kb free_gb
    free_kb="$(get_free_kb)"
    free_gb=$((free_kb / 1024 / 1024))
    if [ "${free_gb}" -lt "${min_gb}" ]; then
        echo "Low disk space: ${free_gb} GB free (need ${min_gb} GB) for ${reason}."
        reclaim_disk_space_basic
        free_kb="$(get_free_kb)"
        free_gb=$((free_kb / 1024 / 1024))
        if [ "${free_gb}" -lt "${min_gb}" ]; then
            echo "ERROR: Insufficient disk space after cleanup (${free_gb} GB free)."
            echo "       Free up space or set SETUP_MIN_FREE_GB/SETUP_MIN_TE_BUILD_FREE_GB."
            exit 1
        fi
    fi
}
start_setup_logging
ensure_free_space_gb "${SETUP_MIN_FREE_GB}" "setup start"
echo "AI Performance Engineering Setup Script"
echo "=========================================="
echo "This script will install:"
echo " • NVIDIA Driver 580.126.09 (auto-upgrade if needed)"
echo " • Python 3.12 (PyTorch 2.10-dev compatible)"
echo " • CUDA 13.0.2 toolkit + cuBLAS 13.1.0.3 (Update 2) repository"
echo " • Environment configured for PyTorch 2.10-dev source build"
echo " • NVIDIA Nsight Systems 2025.3.2 (latest)"
echo " • NVIDIA Nsight Compute 2025.3.1 (latest)"
echo " • All project dependencies"
echo " • System tools (numactl, perf, etc.)"
echo ""
echo "Note: If driver upgrade is needed, you'll be prompted to reboot."
echo ""
REQUIRED_DRIVER_VERSION="580.126.09"
PYTHON_TARGET_VERSION="3.12"
PYTHON_TARGET_MAJOR="${PYTHON_TARGET_VERSION%%.*}"
PYTHON_TARGET_MINOR="${PYTHON_TARGET_VERSION##*.}"
PYTHON_TARGET_BIN="python${PYTHON_TARGET_VERSION}"
PYTHON_ABI_TAG="cp${PYTHON_TARGET_MAJOR}${PYTHON_TARGET_MINOR}"
PYTHON_DIST_PACKAGES="/usr/local/lib/python${PYTHON_TARGET_VERSION}/dist-packages"
CUDA_SHORT_VERSION="13.0"
CUDA_FULL_VERSION="13.0.2"
# cuBLAS is pinned to 13.1.0.3 (Update 2) via cuda-libraries; cuDNN latest in repo
CUDNN_VERSION="9.16.0.29"
NCCL_SHORT_VERSION="2.28.7"
CUDA_HOME_DIR="/usr/local/cuda-${CUDA_SHORT_VERSION}"
THIRD_PARTY_DIR="${PROJECT_ROOT}/third_party"
mkdir -p "${THIRD_PARTY_DIR}"
FLASH_ATTN_TAG="${FLASH_ATTN_TAG:-v2.8.3}"
FLASH_ATTN_ARCH="$(uname -m)"
if [ "${FLASH_ATTN_ARCH}" = "arm64" ]; then
FLASH_ATTN_ARCH="aarch64"
fi
FLASH_ATTN_WHEEL_BASENAME="flash_attn-2.8.3-${PYTHON_ABI_TAG}-${PYTHON_ABI_TAG}-linux_${FLASH_ATTN_ARCH}.whl"
FLASH_ATTN_EXPECTED_VERSION="${FLASH_ATTN_TAG#v}"
detect_default_sm() {
    if [ -n "${GPU_COMPUTE_SM_NUM:-}" ]; then
        echo "${GPU_COMPUTE_SM_NUM}"
        return
    fi
    if command -v nvidia-smi >/dev/null 2>&1; then
        local cap
        cap="$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | head -n 1 | tr -d '[:space:]')"
        if [ -n "${cap}" ]; then
            local major="${cap%.*}"
            local minor="${cap#*.}"
            printf "%s%s\n" "${major}" "${minor}"
            return
        fi
    fi
    echo "121"
}
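# Example mapping: compute capability "10.0" (B200) -> "100"; "12.1" -> "121",
# which is also the fallback printed when no GPU is visible.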
FLASH_ATTENTION_FORCE_CUDA_SM_VALUE="${FLASH_ATTENTION_FORCE_CUDA_SM_VALUE:-$(detect_default_sm)}"
VLLM_REPO_URL="${VLLM_REPO_URL:-https://github.com/vllm-project/vllm.git}"
VLLM_VERSION_TAG="${VLLM_VERSION_TAG:-main}"
INSTALL_GPT_OSS="${INSTALL_GPT_OSS:-0}"
VLLM_GIT_REF="${VLLM_GIT_REF:-${VLLM_VERSION_TAG}}"
VLLM_SRC_DIR="${VLLM_SRC_DIR:-${THIRD_PARTY_DIR}/vllm-src}"
VLLM_WHEEL_DIR="${THIRD_PARTY_DIR}/wheels"
VLLM_WHEEL_INFO_PATH="${VLLM_WHEEL_INFO_PATH:-${VLLM_WHEEL_DIR}/vllm-build-info.json}"
VLLM_WHEEL_ARCH="$(uname -m)"
VLLM_EXTRA_INDEX_URL="${VLLM_EXTRA_INDEX_URL:-https://wheels.vllm.ai/cu130}"
VLLM_PIP_SPEC="${VLLM_PIP_SPEC:-vllm==0.16.0}"
FLASHINFER_EXPECTED_VERSION="${FLASHINFER_EXPECTED_VERSION:-0.6.3}"
TORCHTITAN_TOMLI_VERSION="${TORCHTITAN_TOMLI_VERSION:-2.4.0}"
TORCHTITAN_TYRO_VERSION="${TORCHTITAN_TYRO_VERSION:-1.0.10}"
TORCHTITAN_RUNTIME_DEPS=(
    "tomli==${TORCHTITAN_TOMLI_VERSION}"
    "tyro==${TORCHTITAN_TYRO_VERSION}"
)
VLLM_RUNTIME_DEPS=(
    "cbor2==5.8.0"
    "msgspec==0.20.0"
    "gguf==0.18.0"
    "ijson==3.5.0"
    "pybase64==1.4.3"
    "setproctitle==1.3.7"
    "diskcache==5.6.3"
    "partial-json-parser==0.2.1.1.post7"
    "lm-format-enforcer==0.11.3"
    "outlines_core==0.2.11"
    "llguidance==1.3.0"
    "xgrammar==0.1.29"
    "compressed-tensors==0.13.0"
    "depyf==0.20.0"
    "watchfiles==1.1.1"
    "blake3==1.0.8"
    "anthropic==0.84.0"
    "openai==2.24.0"
    "openai-harmony==0.0.8"
    "model-hosting-container-standards==0.1.13"
    "mcp==1.26.0"
    "grpcio-reflection==1.78.0"
)
if [ "${VLLM_WHEEL_ARCH}" = "arm64" ]; then
    VLLM_WHEEL_ARCH="aarch64"
fi
VLLM_WHEEL_PATTERN="${VLLM_WHEEL_PATTERN:-${VLLM_WHEEL_DIR}/vllm-*-${PYTHON_ABI_TAG}-${PYTHON_ABI_TAG}-linux_${VLLM_WHEEL_ARCH}.whl}"
# =============================================================================
# DEPENDENCY VERSION PINS (update together, test after changes)
# Run: python core/scripts/check_upstream_versions.py --check-te-cutlass
# Source of truth for pinned versions lives here (dependency_versions.json removed)
# =============================================================================
#
# CUTLASS 4.3.0 - Required for SM100a (Blackwell) support
# - Provides: tmem_allocator_sm100.hpp, mma_sm100_umma.hpp, copy_traits_sm100.hpp
# - Commit e67e63c331d6 is post-release with corrected version.h
#
# TransformerEngine v2.9 - Stable release with CUDA 13 wheels
# - IMPORTANT: TE v2.9 still bundles CUTLASS 4.2.0 (commit 57e3cfb47a2d)
# - CUTLASS 4.2.0 LACKS SM100a headers - symlink workaround REQUIRED
# - We replace TE's bundled CUTLASS with our 4.3.0 via symlink
# - Check: make verify-cutlass
#
# When to remove symlink workaround:
# - When TE bundles CUTLASS >= 4.3.0 with SM100a headers
# - Run: python core/scripts/check_upstream_versions.py --check-te-cutlass
# - If "TE main bundles: CUTLASS 4.3.0+" appears, symlink may be removable
#
TE_REPO_URL="${TE_REPO_URL:-https://github.com/NVIDIA/TransformerEngine.git}"
# TE v2.9 release (2025-11-11) - stable release with CUDA 13 support
TE_GIT_COMMIT="${TE_GIT_COMMIT:-v2.9}"
TE_VERSION="v2.9"
TE_BUNDLED_CUTLASS_VERSION="4.2.0" # What TE bundles (needs symlink override)
TE_SRC_DIR="${TE_SRC_DIR:-${THIRD_PARTY_DIR}/TransformerEngine-src}"
CUTLASS_REPO_URL="${CUTLASS_REPO_URL:-https://github.com/NVIDIA/cutlass.git}"
# CUTLASS 4.3.0 release tag
CUTLASS_REF="${CUTLASS_REF:-v4.3.0}"
CUTLASS_TARGET_VERSION="${CUTLASS_TARGET_VERSION:-4.3.0}"
CUTLASS_SRC_DIR="${CUTLASS_SRC_DIR:-${THIRD_PARTY_DIR}/cutlass}"
PIP_ROOT_USER_ACTION="ignore"
SOURCE_BUILD_ALLOWED=0
GPU_COMPUTE_SM_NUM=""
VLLM_PREBUILT_INSTALLED=0
export PROJECT_ROOT REQUIRED_DRIVER_VERSION PYTHON_TARGET_VERSION PYTHON_TARGET_MAJOR PYTHON_TARGET_MINOR PYTHON_TARGET_BIN PYTHON_ABI_TAG PYTHON_DIST_PACKAGES PIP_ROOT_USER_ACTION
if command -v git >/dev/null 2>&1; then
    git config --global --add safe.directory "${PROJECT_ROOT}" >/dev/null 2>&1 || true
    if [ -d "${PROJECT_ROOT}/vendor/pytorch-src" ]; then
        git config --global --add safe.directory "${PROJECT_ROOT}/vendor/pytorch-src" >/dev/null 2>&1 || true
    fi
    if git -C "${PROJECT_ROOT}" rev-parse --is-inside-work-tree >/dev/null 2>&1; then
        if [ -f "${PROJECT_ROOT}/.gitmodules" ]; then
            git -C "${PROJECT_ROOT}" submodule sync --recursive >/dev/null 2>&1 || true
            git -C "${PROJECT_ROOT}" submodule update --init --recursive >/dev/null 2>&1 || true
        fi
    fi
fi
PYTORCH_REPO_URL="${PYTORCH_REPO_URL:-https://github.com/pytorch/pytorch.git}"
PYTORCH_COMMIT="${PYTORCH_COMMIT:-main}"
PYTORCH_SRC_DIR="${PYTORCH_SRC_DIR:-${THIRD_PARTY_DIR}/pytorch-src}"
PYTORCH_BUILD_DIR="${PYTORCH_SRC_DIR}"
PYTORCH_DIST_DIR="${PYTORCH_BUILD_DIR}/dist"
PYTORCH_WHEEL_DIR="${THIRD_PARTY_DIR}/wheels"
PYTORCH_WHEEL_PATTERN="${PYTORCH_WHEEL_PATTERN:-torch-*-${PYTHON_ABI_TAG}-${PYTHON_ABI_TAG}-*.whl}"
mkdir -p "${PYTORCH_WHEEL_DIR}"
TORCH_CUDA_ARCH_LIST_VALUE="10.0;10.3;12.1;12.2+PTX"
CMAKE_CUDA_ARCH_LIST_VALUE="100;103;121;122"
TORCH_SM_ARCH_LIST_VALUE="sm_100;sm_103;sm_121;sm_122"
CUTLASS_NVCC_ARCHS_VALUE_DEFAULT="100;103;121;122"
CUTLASS_NVCC_ARCHS_VALUE="${CUTLASS_NVCC_ARCHS_VALUE_DEFAULT}"
PYTORCH_NIGHTLY_DATE="20251213"
PYTORCH_TORCH_VERSION="2.10.0.dev${PYTORCH_NIGHTLY_DATE}+cu130"
# PYTORCH_TORCHVISION_VERSION="0.25.0.dev${PYTORCH_NIGHTLY_DATE}+cu130"
PYTORCH_TORCHAUDIO_VERSION="2.10.0.dev${PYTORCH_NIGHTLY_DATE}+cu130"
PYTORCH_TORCHAO_VERSION="0.16.0.dev${PYTORCH_NIGHTLY_DATE}+cu130"
PYTORCH_TRITON_VERSION="3.6.0+git8fedd49b"
PYTORCH_NIGHTLY_INDEX="https://download.pytorch.org/whl/nightly"
PYTORCH_CU130_INDEX_ROOT="https://download.pytorch.org/whl/nightly/cu130"
PYTORCH_CU130_INDEX="${PYTORCH_CU130_INDEX_ROOT}"
PYTORCH_TORCH_FIND_LINKS="${PYTORCH_TORCH_FIND_LINKS:-https://download.pytorch.org/whl/nightly/cu130/torch/}"
GPU_CLOCK_SERVICE_PATH="/etc/systemd/system/gpu-clock-pin.service"
echo "Project root: $PROJECT_ROOT"
cd "$PROJECT_ROOT"
# Check if running as root
if [[ $EUID -eq 0 ]]; then
    echo "Running as root. This is fine for containerized environments."
else
    echo "This script requires root privileges. Please run with sudo."
    exit 1
fi
lock_gpu_clocks_if_supported() {
    # Best-effort: lock SM clocks to the max supported value to reduce run-to-run noise.
    # Skips silently if not supported or if nvidia-smi is unavailable.
    if ! command -v nvidia-smi >/dev/null 2>&1; then
        return
    fi
    local gpu_name sm_max mem_max
    gpu_name="$(nvidia-smi --query-gpu=name --format=csv,noheader 2>/dev/null | head -n 1 | tr -d '\r')"
    sm_max="$(nvidia-smi --query-gpu=clocks.max.sm --format=csv,noheader 2>/dev/null | head -n 1 | awk '{print $1}')"
    mem_max="$(nvidia-smi --query-gpu=clocks.max.mem --format=csv,noheader 2>/dev/null | head -n 1 | awk '{print $1}')"
    if [ -z "${sm_max}" ]; then
        echo "Skipping clock lock: sm_max not available from nvidia-smi."
        return
    fi
    echo "Attempting to lock SM clocks for ${gpu_name:-GPU} to ${sm_max} MHz (best-effort; may require admin privileges)..."
    if nvidia-smi -lgc "${sm_max},${sm_max}" >/dev/null 2>&1; then
        echo "  Locked SM clocks to ${sm_max} MHz."
    else
        echo "  SM clock lock not supported on this GPU/driver; continuing without lock."
    fi
    if [ -n "${mem_max}" ]; then
        if nvidia-smi --lock-memory-clocks="${mem_max},${mem_max}" >/dev/null 2>&1; then
            echo "  Locked memory clocks to ${mem_max} MHz."
        elif nvidia-smi --lock-memory-clocks="${mem_max}" >/dev/null 2>&1; then
            echo "  Locked memory clocks to ${mem_max} MHz (single-value interface)."
        elif nvidia-smi --lock-memory-clocks-deferred="${mem_max}" >/dev/null 2>&1; then
            echo "  Deferred lock of memory clocks to ${mem_max} MHz (takes effect after driver reload)."
        else
            echo "  Memory clock lock not supported here; continuing without lock."
        fi
    fi
}
install_gpu_clock_service() {
    # Install a systemd unit to reapply clock locks on boot.
    if ! command -v systemctl >/dev/null 2>&1; then
        echo "systemctl not available; skipping GPU clock service install."
        return
    fi
    cat > "${GPU_CLOCK_SERVICE_PATH}" <<'EOF'
[Unit]
Description=Pin GPU clocks for stability
After=multi-user.target
ConditionPathExists=/usr/bin/nvidia-smi

[Service]
Type=oneshot
RemainAfterExit=yes
ExecStart=/bin/bash -lc '\
sm=$(nvidia-smi --query-gpu=clocks.max.sm --format=csv,noheader | head -n1 | awk "{print \$1}") || exit 0; \
mem=$(nvidia-smi --query-gpu=clocks.max.mem --format=csv,noheader | head -n1 | awk "{print \$1}") || exit 0; \
nvidia-smi -pm 1 || true; \
nvidia-smi -lgc $${sm},$${sm} || true; \
nvidia-smi -ac $${mem},$${sm} || true; \
nvidia-smi --lock-memory-clocks-deferred=$${mem} || true; \
'

[Install]
WantedBy=multi-user.target
EOF
    systemctl daemon-reload || true
    systemctl enable --now gpu-clock-pin.service || true
}
apply_deferred_memory_lock_now() {
    # Try to apply a deferred memory clock lock without a reboot.
    # This is best-effort and will fail if GPU reset isn't supported or processes are running.
    if ! command -v nvidia-smi >/dev/null 2>&1; then
        return
    fi
    if nvidia-smi --gpu-reset >/dev/null 2>&1; then
        echo "Applied deferred memory clock lock via nvidia-smi --gpu-reset."
    else
        echo "INFO: Could not trigger deferred memory lock without reboot; it will apply on next driver reload/boot."
    fi
}
# Lock GPU clocks (best-effort) to reduce perf variance; safe to skip if unsupported.
lock_gpu_clocks_if_supported
install_gpu_clock_service
apply_deferred_memory_lock_now
pip_cmd() {
    if [ -z "${PIP_SUPPORTS_BREAK_SYSTEM_PACKAGES:-}" ]; then
        if python3 -m pip --help 2>&1 | grep -q -- '--break-system-packages'; then
            PIP_SUPPORTS_BREAK_SYSTEM_PACKAGES=1
        else
            PIP_SUPPORTS_BREAK_SYSTEM_PACKAGES=0
        fi
    fi
    if [ "${PIP_SUPPORTS_BREAK_SYSTEM_PACKAGES}" -eq 1 ]; then
        python3 -m pip --break-system-packages "$@"
    else
        PIP_BREAK_SYSTEM_PACKAGES=1 python3 -m pip "$@"
    fi
}
pip_install() {
    pip_cmd install "$@"
}
pip_uninstall() {
    pip_cmd uninstall "$@"
}
pip_wheel() {
    pip_cmd wheel "$@"
}
pip_show() {
    pip_cmd show "$@"
}
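# Example: all pip calls in this script should route through these wrappers,
# e.g. `pip_install --upgrade nvidia-ml-py`, so the --break-system-packages
# handling lives in one place.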
# Ensure a tool is reachable by adding a symlink in /usr/local/bin if found elsewhere.
ensure_tool_on_path() {
    local tool_name="$1"
    shift
    local patterns=("$@")
    local nullglob_state
    nullglob_state="$(shopt -p nullglob || true)"
    shopt -s nullglob
    for pattern in "${patterns[@]}"; do
        for candidate in $pattern; do
            local target="$candidate"
            if [ -d "$target" ] && [ -x "${target}/${tool_name}" ]; then
                target="${target}/${tool_name}"
            fi
            if [ -x "$target" ]; then
                ln -sf "$target" "/usr/local/bin/${tool_name}"
                echo "Ensured ${tool_name} is on PATH via /usr/local/bin/${tool_name} -> ${target}"
                if [ -n "$nullglob_state" ]; then
                    eval "$nullglob_state"
                else
                    shopt -u nullglob
                fi
                return 0
            fi
        done
    done
    if [ -n "$nullglob_state" ]; then
        eval "$nullglob_state"
    else
        shopt -u nullglob
    fi
    return 1
}
# Reusable function to reassemble split wheels
reassemble_split_wheel() {
    local wheel_path="$1"
    local tmp_dir="${2:-$(mktemp -d "${TMPDIR:-/tmp}/wheel-reassemble.XXXXXX")}"
    # If the full wheel exists, return it
    if [ -f "${wheel_path}" ]; then
        echo "${wheel_path}"
        return 0
    fi
    # Check for split parts
    if compgen -G "${wheel_path}.part*" >/dev/null 2>&1; then
        local combined="${tmp_dir}/$(basename "${wheel_path}")"
        mapfile -t PARTS < <(ls "${wheel_path}".part* | sort -V)
        if cat "${PARTS[@]}" > "${combined}" 2>/dev/null; then
            echo "${combined}"
            return 0
        fi
    fi
    return 1
}
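# Example (hypothetical filename): if only torch-<ver>.whl.part00/.part01 exist,
#   whl="$(reassemble_split_wheel "${PYTORCH_WHEEL_DIR}/torch-<ver>-cp312-cp312-linux_x86_64.whl")"
# yields a reassembled temp copy; an existing full wheel is returned unchanged.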
# Reusable function to verify PyTorch CUDA and restore if needed
verify_and_restore_pytorch_cuda() {
    local context="$1"
    # Run the check inside `if !` so `set -e` cannot abort before we report context.
    if ! python3 <<'PY'
import sys

import torch

if not torch.cuda.is_available():
    print("ERROR: PyTorch CUDA not available")
    print(f"  torch.__version__ = {torch.__version__}")
    print(f"  torch.version.cuda = {torch.version.cuda}")
    sys.exit(1)
PY
    then
        echo "CRITICAL: PyTorch CUDA was not available during ${context}!"
        return 1
    fi
    return 0
}
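# Force "Root-Is-Purelib: false" in a wheel's .dist-info/WHEEL metadata so pip
# treats it as a platform wheel (installed into platlib) rather than pure Python.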
ensure_wheel_root_pure() {
    local wheel_path="$1"
    if [ -z "${wheel_path}" ] || [ ! -f "${wheel_path}" ]; then
        return 0
    fi
    python3 - "${wheel_path}" <<'PY'
import shutil
import sys
import tempfile
import zipfile
from pathlib import Path

wheel_path = Path(sys.argv[1])
if not wheel_path.exists():
    raise SystemExit(0)

with zipfile.ZipFile(wheel_path, "r") as src:
    wheel_entries = [name for name in src.namelist() if name.endswith(".dist-info/WHEEL")]
    if not wheel_entries:
        raise SystemExit(0)
    with tempfile.TemporaryDirectory(dir=str(wheel_path.parent)) as tmpdir:
        tmp_path = Path(tmpdir) / wheel_path.name
        with zipfile.ZipFile(tmp_path, "w") as dst:
            for info in src.infolist():
                data = src.read(info.filename)
                if info.filename in wheel_entries:
                    text = data.decode("utf-8").splitlines()
                    for idx, line in enumerate(text):
                        if line.startswith("Root-Is-Purelib:"):
                            if line.strip().lower() != "root-is-purelib: false":
                                text[idx] = "Root-Is-Purelib: false"
                            break
                    else:
                        text.append("Root-Is-Purelib: false")
                    data = ("\n".join(text) + "\n").encode("utf-8")
                dst.writestr(info, data)
        shutil.move(tmp_path, wheel_path)
PY
}
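# Apply the same Root-Is-Purelib fix to Transformer Engine distributions that
# are already installed (patches their on-disk WHEEL metadata in place).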
patch_installed_transformer_engine_metadata() {
    python3 <<'PY'
import importlib.metadata as metadata
from importlib.metadata import PackageNotFoundError

def patch_distribution(name: str) -> None:
    try:
        dist = metadata.distribution(name)
    except PackageNotFoundError:
        return
    files = dist.files or []
    wheel_entry = None
    for file in files:
        if file.name == "WHEEL":
            wheel_entry = dist.locate_file(file)
            break
    if not wheel_entry:
        return
    lines = wheel_entry.read_text().splitlines()
    for idx, line in enumerate(lines):
        if line.startswith("Root-Is-Purelib:"):
            if line.strip().lower() != "root-is-purelib: false":
                lines[idx] = "Root-Is-Purelib: false"
            break
    else:
        lines.append("Root-Is-Purelib: false")
    wheel_entry.write_text("\n".join(lines) + "\n")

for dist_name in ("transformer_engine", "transformer_engine_torch", "transformer_engine_cu12"):
    patch_distribution(dist_name)
PY
}
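# Relax the strict PyPI-provenance assertions in TE's loader so a locally built
# transformer_engine wheel can import without the companion PyPI meta-packages.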
patch_transformer_engine_loader() {
    python3 <<'PY'
from importlib.metadata import PackageNotFoundError, distribution
from pathlib import Path

OLD_BLOCK = '''    if te_framework_installed:
        assert te_installed_via_pypi, "Could not find `transformer-engine` PyPI package."
        assert te_core_installed, "Could not find TE core package `transformer-engine-cu*`."
        assert version(module_name) == version("transformer-engine") == te_core_version, (
            "Transformer Engine package version mismatch. Found"
            f" {module_name} v{version(module_name)}, transformer-engine"
            f" v{version('transformer-engine')}, and {te_core_package_name}"
            f" v{te_core_version}. Install transformer-engine using "
            f"'pip3 install --no-build-isolation transformer-engine[{extra_dep_name}]==VERSION'"
        )
'''
NEW_BLOCK = '''    if te_framework_installed:
        if te_installed_via_pypi and te_core_installed:
            assert version(module_name) == version("transformer-engine") == te_core_version, (
                "Transformer Engine package version mismatch. Found"
                f" {module_name} v{version(module_name)}, transformer-engine"
                f" v{version('transformer-engine')}, and {te_core_package_name}"
                f" v{te_core_version}. Install transformer-engine using "
                f"'pip3 install --no-build-isolation transformer-engine[{extra_dep_name}]==VERSION'"
            )
        else:
            pass
'''

patched = False
for dist_name in ("transformer_engine", "transformer-engine"):
    try:
        dist = distribution(dist_name)
    except PackageNotFoundError:
        continue
    path = Path(dist.locate_file("transformer_engine/common/__init__.py"))
    if not path.exists():
        continue
    text = path.read_text()
    if OLD_BLOCK in text:
        text = text.replace(OLD_BLOCK, NEW_BLOCK, 1)
        path.write_text(text)
        patched = True
if patched:
    print("[setup] Patched Transformer Engine loader for local wheel support")
else:
    print("[setup] Transformer Engine loader patch skipped (already applied)")
PY
}
install_proton_cli_stub() {
    if command -v proton >/dev/null 2>&1; then
        echo "Proton CLI already available (proton command found)"
        return 0
    fi
    local target="/usr/local/bin/proton"
    install -m 755 "${PROJECT_ROOT}/core/scripts/proton_stub.py" "${target}"
    echo "Installed Proton stub CLI at ${target}"
}
install_aisp_cli_wrapper() {
    local target="/usr/local/bin/aisp"
    cat > "${target}" <<EOF
#!/usr/bin/env bash
set -euo pipefail
exec python3 "${PROJECT_ROOT}/cli/aisp.py" "\$@"
EOF
    chmod 755 "${target}"
    echo "Installed aisp CLI wrapper at ${target}"
}
ensure_codex_cli() {
    if command -v codex >/dev/null 2>&1; then
        CODEX_BIN="$(command -v codex)"
        export CODEX_BIN
        echo "Codex CLI detected at ${CODEX_BIN}"
        return 0
    fi
    if [ -z "${CODEX_INSTALL_CMD:-}" ]; then
        echo "ERROR: codex CLI not found. Set CODEX_INSTALL_CMD to install it before MCP setup." >&2
        exit 1
    fi
    echo "Installing Codex CLI..."
    if [ -n "${SUDO_USER:-}" ] && [ "${SUDO_USER}" != "root" ] && [ "${CODEX_INSTALL_AS_USER:-1}" -eq 1 ]; then
        sudo -H -u "${SUDO_USER}" bash -lc "${CODEX_INSTALL_CMD}"
    else
        bash -lc "${CODEX_INSTALL_CMD}"
    fi
    if command -v codex >/dev/null 2>&1; then
        CODEX_BIN="$(command -v codex)"
        export CODEX_BIN
        echo "Codex CLI installed at ${CODEX_BIN}"
        return 0
    fi
    if [ -n "${SUDO_USER:-}" ] && [ "${SUDO_USER}" != "root" ]; then
        local user_home
        user_home="$(getent passwd "${SUDO_USER}" | cut -d: -f6)"
        if [ -n "${user_home}" ] && [ -x "${user_home}/.local/bin/codex" ]; then
            CODEX_BIN="${user_home}/.local/bin/codex"
            export CODEX_BIN
            echo "Codex CLI detected at ${CODEX_BIN}"
            return 0
        fi
    fi
    echo "ERROR: codex CLI not found after install. Ensure it is in PATH or set CODEX_BIN to its full path." >&2
    exit 1
}
remove_conflicting_user_triton() {
    if [ -z "${SUDO_USER:-}" ] || [ "${SUDO_USER}" = "root" ]; then
        return 0
    fi
    local user_site
    user_site=$(sudo -H -u "${SUDO_USER}" python3 -c "import site; print(site.getusersitepackages())" 2>/dev/null) || true
    if [ -z "${user_site}" ]; then
        return 0
    fi
    if sudo -H -u "${SUDO_USER}" test -d "${user_site}/triton"; then
        rm -rf "${user_site}/triton"
    fi
    sudo -H -u "${SUDO_USER}" sh -c "rm -rf ${user_site}/pytorch_triton-*.dist-info" 2>/dev/null || true
}
remove_usercustomize_shim() {
    local targets=(
        "$HOME/.local/lib/python3.12/site-packages/usercustomize.py"
        "/usr/local/lib/python3.12/dist-packages/usercustomize.py"
    )
    for target in "${targets[@]}"; do
        if [ -f "$target" ]; then
            rm -f "$target"
            echo "[setup] Removed legacy usercustomize shim at $target"
        fi
    done
}
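# Replace TE's sanity_checks_for_pypi_installation() with a no-op so wheels
# bundled by this setup (not installed from PyPI) pass TE's import-time checks.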
disable_transformer_engine_sanity_check() {
    python3 <<'PY'
import ast
import importlib.metadata as metadata
from importlib.metadata import PackageNotFoundError
from pathlib import Path

def patch_module(module_path: Path) -> bool:
    if not module_path.exists():
        return False
    source = module_path.read_text()
    try:
        tree = ast.parse(source)
    except SyntaxError:
        return False
    target = None
    for node in tree.body:
        if isinstance(node, ast.FunctionDef) and node.name == "sanity_checks_for_pypi_installation":
            target = node
            break
    if target is None or target.lineno is None or target.end_lineno is None:
        return False
    lines = source.splitlines()
    replacement = [
        "def sanity_checks_for_pypi_installation() -> None:",
        "    \"\"\"Runtime environment bundles TE wheels directly; skip PyPI provenance checks.\"\"\"",
        "    return None",
        "",
    ]
    start = target.lineno - 1
    end = target.end_lineno
    lines[start:end] = replacement
    module_path.write_text("\n".join(lines) + ("\n" if lines and lines[-1] else ""))
    return True

patched_any = False
for dist_name in ("transformer_engine", "transformer-engine"):
    try:
        dist = metadata.distribution(dist_name)
    except PackageNotFoundError:
        continue
    module_path = Path(dist.locate_file("transformer_engine/common/__init__.py"))
    if patch_module(module_path):
        print(f"[setup] Patched Transformer Engine sanity check at {module_path}")
        patched_any = True
if not patched_any:
    print("[setup] Warning: transformer_engine.common not patched (module not found)")
PY
}
patch_transformer_engine_loader
install_proton_cli_stub
install_aisp_cli_wrapper
remove_conflicting_user_triton
remove_usercustomize_shim
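# Smoke-test FP8: torchao float8 training conversion plus a Transformer Engine
# fp8_autocast forward/backward. TE is skipped (not failed) on FlashAttention
# import problems; the function exits nonzero if either check hard-fails.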
verify_fp8_functionality() {
    python3 <<'PY'
import torch

status = {
    "torchao": {"ok": False, "error": ""},
    "transformer_engine": {"ok": False, "error": ""},
}

def torchao_fp8_check():
    try:
        from torchao.float8 import Float8LinearConfig, convert_to_float8_training
    except Exception as exc:
        return False, f"torchao import failed: {exc}"
    try:
        model = torch.nn.Sequential(torch.nn.Linear(128, 128, bias=False)).cuda().half()
        model = convert_to_float8_training(model, config=Float8LinearConfig())
        x = torch.randn(32, 128, device="cuda", dtype=torch.float16, requires_grad=True)
        y = model(x)
        y.float().sum().backward()
        torch.cuda.synchronize()
        return True, ""
    except Exception as exc:
        return False, str(exc)

def te_fp8_check():
    try:
        import transformer_engine.pytorch as te
    except Exception as exc:
        msg = str(exc)
        if "flash_attn" in msg or "flash_attn_2_cuda" in msg or "undefined symbol" in msg:
            return None, f"Transformer Engine import skipped due to FlashAttention issue: {exc}"
        return False, f"Transformer Engine import failed: {exc}"
    try:
        layer = te.Linear(128, 128, bias=False).to(torch.bfloat16).cuda()
        x = torch.randn(32, 128, device="cuda", dtype=torch.bfloat16, requires_grad=True)
        with te.fp8_autocast(enabled=True):
            y = layer(x)
        y.float().sum().backward()
        torch.cuda.synchronize()
        return True, ""
    except Exception as exc:
        return False, str(exc)

status["torchao"]["ok"], status["torchao"]["error"] = torchao_fp8_check()
status["transformer_engine"]["ok"], status["transformer_engine"]["error"] = te_fp8_check()

for name, result in status.items():
    if result["ok"] is True:
        print(f"[setup] ✓ {name} FP8 runtime check passed")
    elif result["ok"] is None:
        print(f"[setup] ⚠ {name} FP8 runtime check skipped: {result['error']}")
    else:
        print(f"[setup] ERROR: {name} FP8 runtime check failed: {result['error']}")

if not all(entry["ok"] or entry["ok"] is None for entry in status.values()):
    raise SystemExit(1)
PY
}
TORCHAO_EXTRA_INDEX_URL="${PYTORCH_CU130_INDEX}"
# Check Ubuntu version
if ! command -v lsb_release &> /dev/null; then
    echo "Installing lsb-release..."
    apt update && apt install -y lsb-release
fi
UBUNTU_VERSION=$(lsb_release -rs)
echo "Detected Ubuntu version: $UBUNTU_VERSION"
if [[ "$UBUNTU_VERSION" != "22.04" && "$UBUNTU_VERSION" != "20.04" ]]; then
    echo "Warning: This script is tested on Ubuntu 22.04. Other versions may work but are not guaranteed."
fi
echo ""
echo "Configuring inotify watch limit for large workspaces..."
TARGET_INOTIFY_WATCHES=524288
CURRENT_INOTIFY_WATCHES=0
if [ -r /proc/sys/fs/inotify/max_user_watches ]; then
    CURRENT_INOTIFY_WATCHES=$(cat /proc/sys/fs/inotify/max_user_watches)
fi
if [ "$CURRENT_INOTIFY_WATCHES" -lt "$TARGET_INOTIFY_WATCHES" ]; then
    if grep -q '^fs.inotify.max_user_watches' /etc/sysctl.conf 2>/dev/null; then
        sed -i "s/^fs\.inotify\.max_user_watches=.*/fs.inotify.max_user_watches=${TARGET_INOTIFY_WATCHES}/" /etc/sysctl.conf
    else
        echo "fs.inotify.max_user_watches=${TARGET_INOTIFY_WATCHES}" >> /etc/sysctl.conf
    fi
    if sysctl -w fs.inotify.max_user_watches="${TARGET_INOTIFY_WATCHES}" >/dev/null 2>&1; then
        echo "Set fs.inotify.max_user_watches=${TARGET_INOTIFY_WATCHES} (consumes up to ~540 MiB if fully utilized)."
    else
        echo "Warning: Failed to apply inotify watch limit via sysctl; please verify manually."
    fi
else
    echo "fs.inotify.max_user_watches already set to ${CURRENT_INOTIFY_WATCHES}."
fi
# Check for NVIDIA GPU
echo ""
echo "Checking for NVIDIA GPU..."
if command -v nvidia-smi &> /dev/null; then
    nvidia-smi
    echo "NVIDIA GPU detected"
    DRIVER_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader,nounits | head -n 1 | tr -d ' ')
    if [[ -n "$DRIVER_VERSION" ]]; then
        if ! dpkg --compare-versions "$DRIVER_VERSION" ge "$REQUIRED_DRIVER_VERSION"; then
            echo "Current NVIDIA driver: $DRIVER_VERSION"
            echo "CUDA ${CUDA_SHORT_VERSION} Update 2 requires driver ${REQUIRED_DRIVER_VERSION}+."
            echo "This script will upgrade it automatically."
        else
            echo "NVIDIA driver version: $DRIVER_VERSION (compatible with CUDA ${CUDA_SHORT_VERSION} Update 2)"
        fi
    fi
else
    echo "NVIDIA GPU not detected. Please ensure NVIDIA drivers are installed."
    exit 1
fi
GPU_COMPUTE_CAP_RAW=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | head -n 1 | tr -d '[:space:]')
GPU_COMPUTE_SM_NUM=$(echo "${GPU_COMPUTE_CAP_RAW}" | tr -d '.')
FLASH_ATTN_TARGET_SM="${GPU_COMPUTE_SM_NUM:-121}"
FLASH_ATTN_TARGET_ARCHS="${FLASH_ATTENTION_CUDA_ARCHS:-${FLASH_ATTN_TARGET_SM}}"
export FLASH_ATTN_TARGET_SM FLASH_ATTN_TARGET_ARCHS
if [[ -n "${GPU_COMPUTE_SM_NUM}" ]]; then
    echo "Detected GPU compute capability: sm_${GPU_COMPUTE_SM_NUM}"
    SOURCE_BUILD_ALLOWED=0
    export GPU_COMPUTE_SM_NUM
else
    SOURCE_BUILD_ALLOWED=0
    echo "Could not detect GPU compute capability; defaulting to prebuilt wheels only."
fi
# Use the current arch for CUTLASS build targets (fall back to defaults if unknown)
if [ -n "${GPU_COMPUTE_SM_NUM}" ]; then
    CUTLASS_NVCC_ARCHS_VALUE="${GPU_COMPUTE_SM_NUM}"
else
    CUTLASS_NVCC_ARCHS_VALUE="${CUTLASS_NVCC_ARCHS_VALUE_DEFAULT}"
fi
if [ -n "${GPU_COMPUTE_SM_NUM}" ]; then
    sm_len=${#GPU_COMPUTE_SM_NUM}
    sm_major="${GPU_COMPUTE_SM_NUM:0:$((sm_len-1))}"
    sm_minor="${GPU_COMPUTE_SM_NUM: -1}"
    # Use the Blackwell 'a' suffix to enable TMA instructions in ptxas
    if [ "${GPU_COMPUTE_SM_NUM}" -eq 100 ]; then
        TE_TORCH_ARCH_LIST="${sm_major}.${sm_minor}a"
        TE_CUTLASS_ARCHS="100a"
    else
        TE_TORCH_ARCH_LIST="${sm_major}.${sm_minor}"
        TE_CUTLASS_ARCHS="${GPU_COMPUTE_SM_NUM}"
    fi
else
    TE_TORCH_ARCH_LIST="10.0"
    TE_CUTLASS_ARCHS="${CUTLASS_NVCC_ARCHS_VALUE_DEFAULT}"
fi
# Ensure open kernel modules are enabled for Blackwell GPUs
MODPROBE_CONF="/etc/modprobe.d/nvidia-open.conf"
if [[ ! -f "$MODPROBE_CONF" ]] || ! grep -q "NVreg_OpenRmEnableUnsupportedGpus=1" "$MODPROBE_CONF"; then
    echo "Configuring NVIDIA open kernel modules for Blackwell GPUs..."
    cat <<'EOF' > "$MODPROBE_CONF"
options nvidia NVreg_OpenRmEnableUnsupportedGpus=1 NVreg_RestrictProfilingToAdminUsers=0
EOF
    update-initramfs -u
    if lsmod | grep -q "^nvidia"; then
        echo "Reloading NVIDIA kernel modules to enable profiling counters..."
        systemctl stop nvidia-persistenced >/dev/null 2>&1 || true
        for module in nvidia_uvm nvidia_peermem nvidia_modeset nvidia_drm nvidia; do
            if lsmod | grep -q "^${module}"; then
                modprobe -r "${module}" >/dev/null 2>&1 || true
            fi
        done
        modprobe nvidia NVreg_OpenRmEnableUnsupportedGpus=1 NVreg_RestrictProfilingToAdminUsers=0 >/dev/null 2>&1 || true
        for module in nvidia_modeset nvidia_uvm nvidia_peermem; do
            modprobe "${module}" >/dev/null 2>&1 || true
        done
        systemctl start nvidia-persistenced >/dev/null 2>&1 || true
    fi
fi
# Update system packages
echo ""
echo "Updating system packages..."
# Fix apt_pkg module before apt update (if Python was upgraded)
if ! python3 -c "import apt_pkg" 2>/dev/null; then
    echo "Fixing apt_pkg module..."
    apt install -y --reinstall python3-apt 2>/dev/null || true
fi
# Disable command-not-found APT hook if it's causing issues with Python upgrade
if [ -f /etc/apt/apt.conf.d/50command-not-found ] && ! /usr/lib/cnf-update-db 2>/dev/null; then
    echo "Disabling problematic command-not-found APT hook..."
    rm -f /etc/apt/apt.conf.d/50command-not-found
fi