#!/bin/bash
#
# AI Performance Engineering Setup Script
# ========================================
#
# This script installs EVERYTHING you need:
# 1. NVIDIA Driver 580.126.09 (auto-upgrades if needed; open kernel modules for B200)
# 2. Python 3.12 (PyTorch 2.10-dev compatible)
# 3. CUDA 13.0.2 toolkit + cuBLAS 13.1.0.3 (Update 2) repository
# 4. Environment for PyTorch 2.10-dev source build with CUDA 13.0.2
# 5. NVIDIA Nsight Systems 2025.3.2 (for timeline profiling)
# 6. NVIDIA Nsight Compute 2025.3.1 (for kernel metrics)
# 7. All Python dependencies from requirements_latest.txt
# 8. System tools (numactl, perf, htop, etc.)
# 9. NVIDIA driver configuration for profiling
#
# Requirements:
# - Ubuntu 22.04+ (tested on 22.04)
# - NVIDIA B200/B300 GPU (or compatible)
# - sudo/root access
# - Internet connection
#
# Usage:
# sudo ./setup.sh
# (logs to ./setup.log by default; override with SETUP_LOG_FILE)
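#   Example overrides (variables documented below; paths are illustrative):
#   sudo SETUP_LOG_FILE=/var/log/aisp-setup.log ./setup.sh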
#
# Duration: 10-20 minutes (first run may require reboot for driver upgrade)
#
# What it does:
# - Adds official NVIDIA CUDA 13.0 (Update 2) repository
# - Configures APT to prefer official NVIDIA packages
# - Fixes Python APT module (python3-apt) compatibility
# - Disables problematic command-not-found APT hook
# - Removes duplicate deadsnakes repository entries
# - Upgrades Python to 3.12 (required by PyTorch 2.10 dev builds)
# - Auto-upgrades NVIDIA driver to 580+ if needed (will prompt reboot)
# - Installs CUDA 13.0.2 toolkit and libraries
# - Installs latest Nsight tools (2025.x)
# - Prepares for PyTorch 2.10-dev (source build) with CUDA 13.0.2
# - Removes conflicting system packages (python3-optree, etc.)
# - Installs nvidia-ml-py (replaces deprecated pynvml)
# - Configures NVIDIA kernel modules for profiling
# - Fixes hardware info script compatibility
# - Runs validation tests
#
# Notes:
# - If driver upgrade is needed, script will exit and ask you to reboot
# - After reboot, simply re-run: sudo ./setup.sh
# - The script is idempotent and safe to re-run
# - Disk cleanup is enabled by default (CLEAN_APT_CACHE/CLEAN_PIP_CACHE/CLEAN_BUILD_ARTIFACTS)
# - Minimum free space thresholds: SETUP_MIN_FREE_GB / SETUP_MIN_TE_BUILD_FREE_GB
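#   (e.g. sudo CLEAN_APT_CACHE=0 CLEAN_PIP_CACHE=0 SETUP_MIN_FREE_GB=12 ./setup.sh
#   keeps caches and demands more free headroom)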
#
# After running this script, you can:
# - Run examples: python3 ch01/performance_basics.py
# - Drive the benchmark suite: python -m cli.aisp bench run
# - Capture peak performance: python core/benchmark/benchmark_peak.py
#
set -e # Exit on any error
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="${SCRIPT_DIR}"
# =============================================================================
# Logging + Disk Hygiene (always-on logging, keep disk usage tidy)
# =============================================================================
LOG_FILE="${SETUP_LOG_FILE:-${PROJECT_ROOT}/setup.log}"
LOG_MAX_BYTES="${SETUP_LOG_MAX_BYTES:-104857600}" # 100 MB
LOG_COMPRESS="${SETUP_LOG_COMPRESS:-1}"
SETUP_MIN_FREE_GB="${SETUP_MIN_FREE_GB:-6}"
SETUP_MIN_TE_BUILD_FREE_GB="${SETUP_MIN_TE_BUILD_FREE_GB:-8}"
CLEAN_APT_CACHE="${CLEAN_APT_CACHE:-1}"
CLEAN_PIP_CACHE="${CLEAN_PIP_CACHE:-1}"
CLEAN_BUILD_ARTIFACTS="${CLEAN_BUILD_ARTIFACTS:-1}"
rotate_setup_log() {
    if [ -f "${LOG_FILE}" ]; then
        local log_size
        log_size="$(stat -c %s "${LOG_FILE}" 2>/dev/null || echo 0)"
        if [ "${log_size}" -ge "${LOG_MAX_BYTES}" ]; then
            local ts rotated
            ts="$(date +%Y%m%d_%H%M%S)"
            rotated="${LOG_FILE}.${ts}"
            mv "${LOG_FILE}" "${rotated}" 2>/dev/null || true
            if [ "${LOG_COMPRESS}" -eq 1 ] && command -v gzip >/dev/null 2>&1; then
                gzip -f "${rotated}" >/dev/null 2>&1 || true
            fi
        fi
    fi
}
start_setup_logging() {
    local log_dir
    log_dir="$(dirname "${LOG_FILE}")"
    mkdir -p "${log_dir}"
    rotate_setup_log
    if [ "$(id -u)" -eq 0 ] && [ -n "${SUDO_USER:-}" ]; then
        touch "${LOG_FILE}" 2>/dev/null || true
        chown "${SUDO_USER}":"${SUDO_USER}" "${LOG_FILE}" 2>/dev/null || true
    fi
    exec > >(tee -a "${LOG_FILE}") 2>&1
    echo "Logging to ${LOG_FILE}"
}
get_free_kb() {
    df -Pk "${PROJECT_ROOT}" | awk 'NR==2 {print $4}'
}
reclaim_disk_space_basic() {
    echo "Reclaiming disk space (APT + pip caches)..."
    if [ "${CLEAN_APT_CACHE}" -eq 1 ] && [ "$(id -u)" -eq 0 ]; then
        apt-get clean >/dev/null 2>&1 || true
        rm -rf /var/lib/apt/lists/* >/dev/null 2>&1 || true
    fi
    if [ "${CLEAN_PIP_CACHE}" -eq 1 ]; then
        python3 -m pip cache purge >/dev/null 2>&1 || true
        rm -rf /root/.cache/pip >/dev/null 2>&1 || true
    fi
}
ensure_free_space_gb() {
    local min_gb="$1"
    local reason="$2"
    local free_kb free_gb
    free_kb="$(get_free_kb)"
    free_gb=$((free_kb / 1024 / 1024))
    if [ "${free_gb}" -lt "${min_gb}" ]; then
        echo "Low disk space: ${free_gb} GB free (need ${min_gb} GB) for ${reason}."
        reclaim_disk_space_basic
        free_kb="$(get_free_kb)"
        free_gb=$((free_kb / 1024 / 1024))
        if [ "${free_gb}" -lt "${min_gb}" ]; then
            echo "ERROR: Insufficient disk space after cleanup (${free_gb} GB free)."
            echo "       Free up space or set SETUP_MIN_FREE_GB/SETUP_MIN_TE_BUILD_FREE_GB."
            exit 1
        fi
    fi
}
start_setup_logging
ensure_free_space_gb "${SETUP_MIN_FREE_GB}" "setup start"
echo "AI Performance Engineering Setup Script"
echo "=========================================="
echo "This script will install:"
echo " • NVIDIA Driver 580.126.09 (auto-upgrade if needed)"
echo " • Python 3.12 (PyTorch 2.10-dev compatible)"
echo " • CUDA 13.0.2 toolkit + cuBLAS 13.1.0.3 (Update 2) repository"
echo " • Environment configured for PyTorch 2.10-dev source build"
echo " • NVIDIA Nsight Systems 2025.3.2 (latest)"
echo " • NVIDIA Nsight Compute 2025.3.1 (latest)"
echo " • All project dependencies"
echo " • System tools (numactl, perf, etc.)"
echo ""
echo "Note: If driver upgrade is needed, you'll be prompted to reboot."
echo ""
REQUIRED_DRIVER_VERSION="580.126.09"
PYTHON_TARGET_VERSION="3.12"
PYTHON_TARGET_MAJOR="${PYTHON_TARGET_VERSION%%.*}"
PYTHON_TARGET_MINOR="${PYTHON_TARGET_VERSION##*.}"
PYTHON_TARGET_BIN="python${PYTHON_TARGET_VERSION}"
PYTHON_ABI_TAG="cp${PYTHON_TARGET_MAJOR}${PYTHON_TARGET_MINOR}"
PYTHON_DIST_PACKAGES="/usr/local/lib/python${PYTHON_TARGET_VERSION}/dist-packages"
CUDA_SHORT_VERSION="13.0"
CUDA_FULL_VERSION="13.0.2"
# cuBLAS is pinned to 13.1.0.3 (Update 2) via cuda-libraries; cuDNN latest in repo
CUDNN_VERSION="9.16.0.29"
NCCL_SHORT_VERSION="2.28.7"
CUDA_HOME_DIR="/usr/local/cuda-${CUDA_SHORT_VERSION}"
THIRD_PARTY_DIR="${PROJECT_ROOT}/third_party"
mkdir -p "${THIRD_PARTY_DIR}"
FLASH_ATTN_TAG="${FLASH_ATTN_TAG:-v2.8.3}"
FLASH_ATTN_ARCH="$(uname -m)"
if [ "${FLASH_ATTN_ARCH}" = "arm64" ]; then
FLASH_ATTN_ARCH="aarch64"
fi
FLASH_ATTN_WHEEL_BASENAME="flash_attn-2.8.3-${PYTHON_ABI_TAG}-${PYTHON_ABI_TAG}-linux_${FLASH_ATTN_ARCH}.whl"
FLASH_ATTN_EXPECTED_VERSION="${FLASH_ATTN_TAG#v}"
detect_default_sm() {
    if [ -n "${GPU_COMPUTE_SM_NUM:-}" ]; then
        echo "${GPU_COMPUTE_SM_NUM}"
        return
    fi
    if command -v nvidia-smi >/dev/null 2>&1; then
        local cap
        cap="$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | head -n 1 | tr -d '[:space:]')"
        if [ -n "${cap}" ]; then
            local major="${cap%.*}"
            local minor="${cap#*.}"
            printf "%s%s\n" "${major}" "${minor}"
            return
        fi
    fi
    echo "121"
}
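# Example mapping: compute capability "10.0" (B200) -> "100"; "12.1" -> "121",
# which is also the fallback printed when no GPU is visible.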
FLASH_ATTENTION_FORCE_CUDA_SM_VALUE="${FLASH_ATTENTION_FORCE_CUDA_SM_VALUE:-$(detect_default_sm)}"
VLLM_REPO_URL="${VLLM_REPO_URL:-https://github.com/vllm-project/vllm.git}"
VLLM_VERSION_TAG="${VLLM_VERSION_TAG:-main}"
INSTALL_GPT_OSS="${INSTALL_GPT_OSS:-0}"
VLLM_GIT_REF="${VLLM_GIT_REF:-${VLLM_VERSION_TAG}}"
VLLM_SRC_DIR="${VLLM_SRC_DIR:-${THIRD_PARTY_DIR}/vllm-src}"
VLLM_WHEEL_DIR="${THIRD_PARTY_DIR}/wheels"
VLLM_WHEEL_INFO_PATH="${VLLM_WHEEL_INFO_PATH:-${VLLM_WHEEL_DIR}/vllm-build-info.json}"
VLLM_WHEEL_ARCH="$(uname -m)"
VLLM_EXTRA_INDEX_URL="${VLLM_EXTRA_INDEX_URL:-https://wheels.vllm.ai/cu130}"
VLLM_PIP_SPEC="${VLLM_PIP_SPEC:-vllm==0.16.0}"
FLASHINFER_EXPECTED_VERSION="${FLASHINFER_EXPECTED_VERSION:-0.6.3}"
TORCHTITAN_TOMLI_VERSION="${TORCHTITAN_TOMLI_VERSION:-2.4.0}"
TORCHTITAN_TYRO_VERSION="${TORCHTITAN_TYRO_VERSION:-1.0.10}"
TORCHTITAN_RUNTIME_DEPS=(
    "tomli==${TORCHTITAN_TOMLI_VERSION}"
    "tyro==${TORCHTITAN_TYRO_VERSION}"
)
VLLM_RUNTIME_DEPS=(
    "cbor2==5.8.0"
    "msgspec==0.20.0"
    "gguf==0.18.0"
    "ijson==3.5.0"
    "pybase64==1.4.3"
    "setproctitle==1.3.7"
    "diskcache==5.6.3"
    "partial-json-parser==0.2.1.1.post7"
    "lm-format-enforcer==0.11.3"
    "outlines_core==0.2.11"
    "llguidance==1.3.0"
    "xgrammar==0.1.29"
    "compressed-tensors==0.13.0"
    "depyf==0.20.0"
    "watchfiles==1.1.1"
    "blake3==1.0.8"
    "anthropic==0.84.0"
    "openai==2.24.0"
    "openai-harmony==0.0.8"
    "model-hosting-container-standards==0.1.13"
    "mcp==1.26.0"
    "grpcio-reflection==1.78.0"
)
if [ "${VLLM_WHEEL_ARCH}" = "arm64" ]; then
    VLLM_WHEEL_ARCH="aarch64"
fi
VLLM_WHEEL_PATTERN="${VLLM_WHEEL_PATTERN:-${VLLM_WHEEL_DIR}/vllm-*-${PYTHON_ABI_TAG}-${PYTHON_ABI_TAG}-linux_${VLLM_WHEEL_ARCH}.whl}"
# =============================================================================
# DEPENDENCY VERSION PINS (update together, test after changes)
# Run: python core/scripts/check_upstream_versions.py --check-te-cutlass
# Source of truth for pinned versions lives here (dependency_versions.json removed)
# =============================================================================
#
# CUTLASS 4.3.0 - Required for SM100a (Blackwell) support
# - Provides: tmem_allocator_sm100.hpp, mma_sm100_umma.hpp, copy_traits_sm100.hpp
# - Commit e67e63c331d6 is post-release with corrected version.h
#
# TransformerEngine v2.9 - Stable release with CUDA 13 wheels
# - IMPORTANT: TE v2.9 still bundles CUTLASS 4.2.0 (commit 57e3cfb47a2d)
# - CUTLASS 4.2.0 LACKS SM100a headers - symlink workaround REQUIRED
# - We replace TE's bundled CUTLASS with our 4.3.0 via symlink
# - Check: make verify-cutlass
#
# When to remove symlink workaround:
# - When TE bundles CUTLASS >= 4.3.0 with SM100a headers
# - Run: python core/scripts/check_upstream_versions.py --check-te-cutlass
# - If "TE main bundles: CUTLASS 4.3.0+" appears, symlink may be removable
#
TE_REPO_URL="${TE_REPO_URL:-https://github.com/NVIDIA/TransformerEngine.git}"
# TE v2.9 release (2025-11-11) - stable release with CUDA 13 support
TE_GIT_COMMIT="${TE_GIT_COMMIT:-v2.9}"
TE_VERSION="v2.9"
TE_BUNDLED_CUTLASS_VERSION="4.2.0" # What TE bundles (needs symlink override)
TE_SRC_DIR="${TE_SRC_DIR:-${THIRD_PARTY_DIR}/TransformerEngine-src}"
CUTLASS_REPO_URL="${CUTLASS_REPO_URL:-https://github.com/NVIDIA/cutlass.git}"
# CUTLASS 4.3.0 release tag
CUTLASS_REF="${CUTLASS_REF:-v4.3.0}"
CUTLASS_TARGET_VERSION="${CUTLASS_TARGET_VERSION:-4.3.0}"
CUTLASS_SRC_DIR="${CUTLASS_SRC_DIR:-${THIRD_PARTY_DIR}/cutlass}"
PIP_ROOT_USER_ACTION="ignore"
SOURCE_BUILD_ALLOWED=0
GPU_COMPUTE_SM_NUM=""
VLLM_PREBUILT_INSTALLED=0
export PROJECT_ROOT REQUIRED_DRIVER_VERSION PYTHON_TARGET_VERSION PYTHON_TARGET_MAJOR PYTHON_TARGET_MINOR PYTHON_TARGET_BIN PYTHON_ABI_TAG PYTHON_DIST_PACKAGES PIP_ROOT_USER_ACTION
if command -v git >/dev/null 2>&1; then
    git config --global --add safe.directory "${PROJECT_ROOT}" >/dev/null 2>&1 || true
    if [ -d "${PROJECT_ROOT}/vendor/pytorch-src" ]; then
        git config --global --add safe.directory "${PROJECT_ROOT}/vendor/pytorch-src" >/dev/null 2>&1 || true
    fi
    if git -C "${PROJECT_ROOT}" rev-parse --is-inside-work-tree >/dev/null 2>&1; then
        if [ -f "${PROJECT_ROOT}/.gitmodules" ]; then
            git -C "${PROJECT_ROOT}" submodule sync --recursive >/dev/null 2>&1 || true
            git -C "${PROJECT_ROOT}" submodule update --init --recursive >/dev/null 2>&1 || true
        fi
    fi
fi
PYTORCH_REPO_URL="${PYTORCH_REPO_URL:-https://github.com/pytorch/pytorch.git}"
PYTORCH_COMMIT="${PYTORCH_COMMIT:-main}"
PYTORCH_SRC_DIR="${PYTORCH_SRC_DIR:-${THIRD_PARTY_DIR}/pytorch-src}"
PYTORCH_BUILD_DIR="${PYTORCH_SRC_DIR}"
PYTORCH_DIST_DIR="${PYTORCH_BUILD_DIR}/dist"
PYTORCH_WHEEL_DIR="${THIRD_PARTY_DIR}/wheels"
PYTORCH_WHEEL_PATTERN="${PYTORCH_WHEEL_PATTERN:-torch-*-${PYTHON_ABI_TAG}-${PYTHON_ABI_TAG}-*.whl}"
mkdir -p "${PYTORCH_WHEEL_DIR}"
TORCH_CUDA_ARCH_LIST_VALUE="10.0;10.3;12.1;12.2+PTX"
CMAKE_CUDA_ARCH_LIST_VALUE="100;103;121;122"
TORCH_SM_ARCH_LIST_VALUE="sm_100;sm_103;sm_121;sm_122"
CUTLASS_NVCC_ARCHS_VALUE_DEFAULT="100;103;121;122"
CUTLASS_NVCC_ARCHS_VALUE="${CUTLASS_NVCC_ARCHS_VALUE_DEFAULT}"
PYTORCH_NIGHTLY_DATE="20251213"
PYTORCH_TORCH_VERSION="2.10.0.dev${PYTORCH_NIGHTLY_DATE}+cu130"
# PYTORCH_TORCHVISION_VERSION="0.25.0.dev${PYTORCH_NIGHTLY_DATE}+cu130"
PYTORCH_TORCHAUDIO_VERSION="2.10.0.dev${PYTORCH_NIGHTLY_DATE}+cu130"
PYTORCH_TORCHAO_VERSION="0.16.0.dev${PYTORCH_NIGHTLY_DATE}+cu130"
PYTORCH_TRITON_VERSION="3.6.0+git8fedd49b"
PYTORCH_NIGHTLY_INDEX="https://download.pytorch.org/whl/nightly"
PYTORCH_CU130_INDEX_ROOT="https://download.pytorch.org/whl/nightly/cu130"
PYTORCH_CU130_INDEX="${PYTORCH_CU130_INDEX_ROOT}"
PYTORCH_TORCH_FIND_LINKS="${PYTORCH_TORCH_FIND_LINKS:-https://download.pytorch.org/whl/nightly/cu130/torch/}"
GPU_CLOCK_SERVICE_PATH="/etc/systemd/system/gpu-clock-pin.service"
echo "Project root: $PROJECT_ROOT"
cd "$PROJECT_ROOT"
# Check if running as root
if [[ $EUID -eq 0 ]]; then
    echo "Running as root. This is fine for containerized environments."
else
    echo "This script requires root privileges. Please run with sudo."
    exit 1
fi
lock_gpu_clocks_if_supported() {
    # Best-effort: lock SM clocks to the max supported value to reduce run-to-run noise.
    # Skips silently if not supported or if nvidia-smi is unavailable.
    if ! command -v nvidia-smi >/dev/null 2>&1; then
        return
    fi
    local gpu_name sm_max mem_max
    gpu_name="$(nvidia-smi --query-gpu=name --format=csv,noheader 2>/dev/null | head -n 1 | tr -d '\r')"
    sm_max="$(nvidia-smi --query-gpu=clocks.max.sm --format=csv,noheader 2>/dev/null | head -n 1 | awk '{print $1}')"
    mem_max="$(nvidia-smi --query-gpu=clocks.max.mem --format=csv,noheader 2>/dev/null | head -n 1 | awk '{print $1}')"
    if [ -z "${sm_max}" ]; then
        echo "Skipping clock lock: sm_max not available from nvidia-smi."
        return
    fi
    echo "Attempting to lock SM clocks for ${gpu_name:-GPU} to ${sm_max} MHz (best-effort; may require admin privileges)..."
    if nvidia-smi -lgc "${sm_max},${sm_max}" >/dev/null 2>&1; then
        echo "  Locked SM clocks to ${sm_max} MHz."
    else
        echo "  SM clock lock not supported on this GPU/driver; continuing without lock."
    fi
    if [ -n "${mem_max}" ]; then
        if nvidia-smi --lock-memory-clocks="${mem_max},${mem_max}" >/dev/null 2>&1; then
            echo "  Locked memory clocks to ${mem_max} MHz."
        elif nvidia-smi --lock-memory-clocks="${mem_max}" >/dev/null 2>&1; then
            echo "  Locked memory clocks to ${mem_max} MHz (single-value interface)."
        elif nvidia-smi --lock-memory-clocks-deferred="${mem_max}" >/dev/null 2>&1; then
            echo "  Deferred lock of memory clocks to ${mem_max} MHz (takes effect after driver reload)."
        else
            echo "  Memory clock lock not supported here; continuing without lock."
        fi
    fi
}
install_gpu_clock_service() {
    # Install a systemd unit to reapply clock locks on boot.
    if ! command -v systemctl >/dev/null 2>&1; then
        echo "systemctl not available; skipping GPU clock service install."
        return
    fi
    cat > "${GPU_CLOCK_SERVICE_PATH}" <<'EOF'
[Unit]
Description=Pin GPU clocks for stability
After=multi-user.target
ConditionPathExists=/usr/bin/nvidia-smi

[Service]
Type=oneshot
RemainAfterExit=yes
ExecStart=/bin/bash -lc '\
sm=$(nvidia-smi --query-gpu=clocks.max.sm --format=csv,noheader | head -n1 | awk "{print \$1}") || exit 0; \
mem=$(nvidia-smi --query-gpu=clocks.max.mem --format=csv,noheader | head -n1 | awk "{print \$1}") || exit 0; \
nvidia-smi -pm 1 || true; \
nvidia-smi -lgc $${sm},$${sm} || true; \
nvidia-smi -ac $${mem},$${sm} || true; \
nvidia-smi --lock-memory-clocks-deferred=$${mem} || true; \
'

[Install]
WantedBy=multi-user.target
EOF
    systemctl daemon-reload || true
    systemctl enable --now gpu-clock-pin.service || true
}
apply_deferred_memory_lock_now() {
    # Try to apply a deferred memory clock lock without a reboot.
    # This is best-effort and will fail if GPU reset isn't supported or processes are running.
    if ! command -v nvidia-smi >/dev/null 2>&1; then
        return
    fi
    if nvidia-smi --gpu-reset >/dev/null 2>&1; then
        echo "Applied deferred memory clock lock via nvidia-smi --gpu-reset."
    else
        echo "INFO: Could not trigger deferred memory lock without reboot; it will apply on next driver reload/boot."
    fi
}
# Lock GPU clocks (best-effort) to reduce perf variance; safe to skip if unsupported.
lock_gpu_clocks_if_supported
install_gpu_clock_service
apply_deferred_memory_lock_now
pip_cmd() {
    if [ -z "${PIP_SUPPORTS_BREAK_SYSTEM_PACKAGES:-}" ]; then
        if python3 -m pip --help 2>&1 | grep -q -- '--break-system-packages'; then
            PIP_SUPPORTS_BREAK_SYSTEM_PACKAGES=1
        else
            PIP_SUPPORTS_BREAK_SYSTEM_PACKAGES=0
        fi
    fi
    if [ "${PIP_SUPPORTS_BREAK_SYSTEM_PACKAGES}" -eq 1 ]; then
        python3 -m pip --break-system-packages "$@"
    else
        PIP_BREAK_SYSTEM_PACKAGES=1 python3 -m pip "$@"
    fi
}
pip_install() {
    pip_cmd install "$@"
}
pip_uninstall() {
    pip_cmd uninstall "$@"
}
pip_wheel() {
    pip_cmd wheel "$@"
}
pip_show() {
    pip_cmd show "$@"
}
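# Example: all pip calls in this script should route through these wrappers,
# e.g. `pip_install --upgrade nvidia-ml-py`, so the --break-system-packages
# handling lives in one place.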
# Ensure a tool is reachable by adding a symlink in /usr/local/bin if found elsewhere.
ensure_tool_on_path() {
    local tool_name="$1"
    shift
    local patterns=("$@")
    local nullglob_state
    nullglob_state="$(shopt -p nullglob || true)"
    shopt -s nullglob
    for pattern in "${patterns[@]}"; do
        for candidate in $pattern; do
            local target="$candidate"
            if [ -d "$target" ] && [ -x "${target}/${tool_name}" ]; then
                target="${target}/${tool_name}"
            fi
            if [ -x "$target" ]; then
                ln -sf "$target" "/usr/local/bin/${tool_name}"
                echo "Ensured ${tool_name} is on PATH via /usr/local/bin/${tool_name} -> ${target}"
                if [ -n "$nullglob_state" ]; then
                    eval "$nullglob_state"
                else
                    shopt -u nullglob
                fi
                return 0
            fi
        done
    done
    if [ -n "$nullglob_state" ]; then
        eval "$nullglob_state"
    else
        shopt -u nullglob
    fi
    return 1
}
# Reusable function to reassemble split wheels
reassemble_split_wheel() {
    local wheel_path="$1"
    local tmp_dir="${2:-$(mktemp -d "${TMPDIR:-/tmp}/wheel-reassemble.XXXXXX")}"
    # If the full wheel exists, return it
    if [ -f "${wheel_path}" ]; then
        echo "${wheel_path}"
        return 0
    fi
    # Check for split parts
    if compgen -G "${wheel_path}.part*" >/dev/null 2>&1; then
        local combined="${tmp_dir}/$(basename "${wheel_path}")"
        mapfile -t PARTS < <(ls "${wheel_path}".part* | sort -V)
        if cat "${PARTS[@]}" > "${combined}" 2>/dev/null; then
            echo "${combined}"
            return 0
        fi
    fi
    return 1
}
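# Example (hypothetical filename): if only torch-<ver>.whl.part00/.part01 exist,
#   whl="$(reassemble_split_wheel "${PYTORCH_WHEEL_DIR}/torch-<ver>-cp312-cp312-linux_x86_64.whl")"
# yields a reassembled temp copy; an existing full wheel is returned unchanged.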
# Reusable function to verify PyTorch CUDA and restore if needed
verify_and_restore_pytorch_cuda() {
    local context="$1"
    # Run the check inside `if !` so `set -e` cannot abort before we report context.
    if ! python3 <<'PY'
import sys

import torch

if not torch.cuda.is_available():
    print("ERROR: PyTorch CUDA not available")
    print(f"  torch.__version__ = {torch.__version__}")
    print(f"  torch.version.cuda = {torch.version.cuda}")
    sys.exit(1)
PY
    then
        echo "CRITICAL: PyTorch CUDA was not available during ${context}!"
        return 1
    fi
    return 0
}
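# Force "Root-Is-Purelib: false" in a wheel's .dist-info/WHEEL metadata so pip
# treats it as a platform wheel (installed into platlib) rather than pure Python.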
ensure_wheel_root_pure() {
    local wheel_path="$1"
    if [ -z "${wheel_path}" ] || [ ! -f "${wheel_path}" ]; then
        return 0
    fi
    python3 - "${wheel_path}" <<'PY'
import shutil
import sys
import tempfile
import zipfile
from pathlib import Path

wheel_path = Path(sys.argv[1])
if not wheel_path.exists():
    raise SystemExit(0)

with zipfile.ZipFile(wheel_path, "r") as src:
    wheel_entries = [name for name in src.namelist() if name.endswith(".dist-info/WHEEL")]
    if not wheel_entries:
        raise SystemExit(0)
    with tempfile.TemporaryDirectory(dir=str(wheel_path.parent)) as tmpdir:
        tmp_path = Path(tmpdir) / wheel_path.name
        with zipfile.ZipFile(tmp_path, "w") as dst:
            for info in src.infolist():
                data = src.read(info.filename)
                if info.filename in wheel_entries:
                    text = data.decode("utf-8").splitlines()
                    for idx, line in enumerate(text):
                        if line.startswith("Root-Is-Purelib:"):
                            if line.strip().lower() != "root-is-purelib: false":
                                text[idx] = "Root-Is-Purelib: false"
                            break
                    else:
                        text.append("Root-Is-Purelib: false")
                    data = ("\n".join(text) + "\n").encode("utf-8")
                dst.writestr(info, data)
        shutil.move(tmp_path, wheel_path)
PY
}
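# Apply the same Root-Is-Purelib fix to Transformer Engine distributions that
# are already installed (patches their on-disk WHEEL metadata in place).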
patch_installed_transformer_engine_metadata() {
    python3 <<'PY'
import importlib.metadata as metadata
from importlib.metadata import PackageNotFoundError

def patch_distribution(name: str) -> None:
    try:
        dist = metadata.distribution(name)
    except PackageNotFoundError:
        return
    files = dist.files or []
    wheel_entry = None
    for file in files:
        if file.name == "WHEEL":
            wheel_entry = dist.locate_file(file)
            break
    if not wheel_entry:
        return
    lines = wheel_entry.read_text().splitlines()
    for idx, line in enumerate(lines):
        if line.startswith("Root-Is-Purelib:"):
            if line.strip().lower() != "root-is-purelib: false":
                lines[idx] = "Root-Is-Purelib: false"
            break
    else:
        lines.append("Root-Is-Purelib: false")
    wheel_entry.write_text("\n".join(lines) + "\n")

for dist_name in ("transformer_engine", "transformer_engine_torch", "transformer_engine_cu12"):
    patch_distribution(dist_name)
PY
}
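# Relax the strict PyPI-provenance assertions in TE's loader so a locally built
# transformer_engine wheel can import without the companion PyPI meta-packages.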
patch_transformer_engine_loader() {
    python3 <<'PY'
from importlib.metadata import PackageNotFoundError, distribution
from pathlib import Path

OLD_BLOCK = '''    if te_framework_installed:
        assert te_installed_via_pypi, "Could not find `transformer-engine` PyPI package."
        assert te_core_installed, "Could not find TE core package `transformer-engine-cu*`."
        assert version(module_name) == version("transformer-engine") == te_core_version, (
            "Transformer Engine package version mismatch. Found"
            f" {module_name} v{version(module_name)}, transformer-engine"
            f" v{version('transformer-engine')}, and {te_core_package_name}"
            f" v{te_core_version}. Install transformer-engine using "
            f"'pip3 install --no-build-isolation transformer-engine[{extra_dep_name}]==VERSION'"
        )
'''
NEW_BLOCK = '''    if te_framework_installed:
        if te_installed_via_pypi and te_core_installed:
            assert version(module_name) == version("transformer-engine") == te_core_version, (
                "Transformer Engine package version mismatch. Found"
                f" {module_name} v{version(module_name)}, transformer-engine"
                f" v{version('transformer-engine')}, and {te_core_package_name}"
                f" v{te_core_version}. Install transformer-engine using "
                f"'pip3 install --no-build-isolation transformer-engine[{extra_dep_name}]==VERSION'"
            )
        else:
            pass
'''

patched = False
for dist_name in ("transformer_engine", "transformer-engine"):
    try:
        dist = distribution(dist_name)
    except PackageNotFoundError:
        continue
    path = Path(dist.locate_file("transformer_engine/common/__init__.py"))
    if not path.exists():
        continue
    text = path.read_text()
    if OLD_BLOCK in text:
        text = text.replace(OLD_BLOCK, NEW_BLOCK, 1)
        path.write_text(text)
        patched = True
if patched:
    print("[setup] Patched Transformer Engine loader for local wheel support")
else:
    print("[setup] Transformer Engine loader patch skipped (already applied)")
PY
}
install_proton_cli_stub() {
    if command -v proton >/dev/null 2>&1; then
        echo "Proton CLI already available (proton command found)"
        return 0
    fi
    local target="/usr/local/bin/proton"
    install -m 755 "${PROJECT_ROOT}/core/scripts/proton_stub.py" "${target}"
    echo "Installed Proton stub CLI at ${target}"
}
install_aisp_cli_wrapper() {
    local target="/usr/local/bin/aisp"
    cat > "${target}" <<EOF
#!/usr/bin/env bash
set -euo pipefail
exec python3 "${PROJECT_ROOT}/cli/aisp.py" "\$@"
EOF
    chmod 755 "${target}"
    echo "Installed aisp CLI wrapper at ${target}"
}
ensure_codex_cli() {
    if command -v codex >/dev/null 2>&1; then
        CODEX_BIN="$(command -v codex)"
        export CODEX_BIN
        echo "Codex CLI detected at ${CODEX_BIN}"
        return 0
    fi
    if [ -z "${CODEX_INSTALL_CMD:-}" ]; then
        echo "ERROR: codex CLI not found. Set CODEX_INSTALL_CMD to install it before MCP setup." >&2
        exit 1
    fi
    echo "Installing Codex CLI..."
    if [ -n "${SUDO_USER:-}" ] && [ "${SUDO_USER}" != "root" ] && [ "${CODEX_INSTALL_AS_USER:-1}" -eq 1 ]; then
        sudo -H -u "${SUDO_USER}" bash -lc "${CODEX_INSTALL_CMD}"
    else
        bash -lc "${CODEX_INSTALL_CMD}"
    fi
    if command -v codex >/dev/null 2>&1; then
        CODEX_BIN="$(command -v codex)"
        export CODEX_BIN
        echo "Codex CLI installed at ${CODEX_BIN}"
        return 0
    fi
    if [ -n "${SUDO_USER:-}" ] && [ "${SUDO_USER}" != "root" ]; then
        local user_home
        user_home="$(getent passwd "${SUDO_USER}" | cut -d: -f6)"
        if [ -n "${user_home}" ] && [ -x "${user_home}/.local/bin/codex" ]; then
            CODEX_BIN="${user_home}/.local/bin/codex"
            export CODEX_BIN
            echo "Codex CLI detected at ${CODEX_BIN}"
            return 0
        fi
    fi
    echo "ERROR: codex CLI not found after install. Ensure it is in PATH or set CODEX_BIN to its full path." >&2
    exit 1
}
remove_conflicting_user_triton() {
    if [ -z "${SUDO_USER:-}" ] || [ "${SUDO_USER}" = "root" ]; then
        return 0
    fi
    local user_site
    user_site=$(sudo -H -u "${SUDO_USER}" python3 -c "import site; print(site.getusersitepackages())" 2>/dev/null) || true
    if [ -z "${user_site}" ]; then
        return 0
    fi
    if sudo -H -u "${SUDO_USER}" test -d "${user_site}/triton"; then
        rm -rf "${user_site}/triton"
    fi
    sudo -H -u "${SUDO_USER}" sh -c "rm -rf ${user_site}/pytorch_triton-*.dist-info" 2>/dev/null || true
}
remove_usercustomize_shim() {
    local targets=(
        "$HOME/.local/lib/python3.12/site-packages/usercustomize.py"
        "/usr/local/lib/python3.12/dist-packages/usercustomize.py"
    )
    for target in "${targets[@]}"; do
        if [ -f "$target" ]; then
            rm -f "$target"
            echo "[setup] Removed legacy usercustomize shim at $target"
        fi
    done
}
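# Replace TE's sanity_checks_for_pypi_installation() with a no-op so wheels
# bundled by this setup (not installed from PyPI) pass TE's import-time checks.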
disable_transformer_engine_sanity_check() {
    python3 <<'PY'
import ast
import importlib.metadata as metadata
from importlib.metadata import PackageNotFoundError
from pathlib import Path

def patch_module(module_path: Path) -> bool:
    if not module_path.exists():
        return False
    source = module_path.read_text()
    try:
        tree = ast.parse(source)
    except SyntaxError:
        return False
    target = None
    for node in tree.body:
        if isinstance(node, ast.FunctionDef) and node.name == "sanity_checks_for_pypi_installation":
            target = node
            break
    if target is None or target.lineno is None or target.end_lineno is None:
        return False
    lines = source.splitlines()
    replacement = [
        "def sanity_checks_for_pypi_installation() -> None:",
        "    \"\"\"Runtime environment bundles TE wheels directly; skip PyPI provenance checks.\"\"\"",
        "    return None",
        "",
    ]
    start = target.lineno - 1
    end = target.end_lineno
    lines[start:end] = replacement
    module_path.write_text("\n".join(lines) + ("\n" if lines and lines[-1] else ""))
    return True

patched_any = False
for dist_name in ("transformer_engine", "transformer-engine"):
    try:
        dist = metadata.distribution(dist_name)
    except PackageNotFoundError:
        continue
    module_path = Path(dist.locate_file("transformer_engine/common/__init__.py"))
    if patch_module(module_path):
        print(f"[setup] Patched Transformer Engine sanity check at {module_path}")
        patched_any = True
if not patched_any:
    print("[setup] Warning: transformer_engine.common not patched (module not found)")
PY
}
patch_transformer_engine_loader
install_proton_cli_stub
install_aisp_cli_wrapper
remove_conflicting_user_triton
remove_usercustomize_shim
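# Smoke-test FP8: torchao float8 training conversion plus a Transformer Engine
# fp8_autocast forward/backward. TE is skipped (not failed) on FlashAttention
# import problems; the function exits nonzero if either check hard-fails.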
verify_fp8_functionality() {
    python3 <<'PY'
import torch

status = {
    "torchao": {"ok": False, "error": ""},
    "transformer_engine": {"ok": False, "error": ""},
}

def torchao_fp8_check():
    try:
        from torchao.float8 import Float8LinearConfig, convert_to_float8_training
    except Exception as exc:
        return False, f"torchao import failed: {exc}"
    try:
        model = torch.nn.Sequential(torch.nn.Linear(128, 128, bias=False)).cuda().half()
        model = convert_to_float8_training(model, config=Float8LinearConfig())
        x = torch.randn(32, 128, device="cuda", dtype=torch.float16, requires_grad=True)
        y = model(x)
        y.float().sum().backward()
        torch.cuda.synchronize()
        return True, ""
    except Exception as exc:
        return False, str(exc)

def te_fp8_check():
    try:
        import transformer_engine.pytorch as te
    except Exception as exc:
        msg = str(exc)
        if "flash_attn" in msg or "flash_attn_2_cuda" in msg or "undefined symbol" in msg:
            return None, f"Transformer Engine import skipped due to FlashAttention issue: {exc}"
        return False, f"Transformer Engine import failed: {exc}"
    try:
        layer = te.Linear(128, 128, bias=False).to(torch.bfloat16).cuda()
        x = torch.randn(32, 128, device="cuda", dtype=torch.bfloat16, requires_grad=True)
        with te.fp8_autocast(enabled=True):
            y = layer(x)
        y.float().sum().backward()
        torch.cuda.synchronize()
        return True, ""
    except Exception as exc:
        return False, str(exc)

status["torchao"]["ok"], status["torchao"]["error"] = torchao_fp8_check()
status["transformer_engine"]["ok"], status["transformer_engine"]["error"] = te_fp8_check()

for name, result in status.items():
    if result["ok"] is True:
        print(f"[setup] ✓ {name} FP8 runtime check passed")
    elif result["ok"] is None:
        print(f"[setup] ⚠ {name} FP8 runtime check skipped: {result['error']}")
    else:
        print(f"[setup] ERROR: {name} FP8 runtime check failed: {result['error']}")

if not all(entry["ok"] or entry["ok"] is None for entry in status.values()):
    raise SystemExit(1)
PY
}
TORCHAO_EXTRA_INDEX_URL="${PYTORCH_CU130_INDEX}"
# Check Ubuntu version
if ! command -v lsb_release &> /dev/null; then
    echo "Installing lsb-release..."
    apt update && apt install -y lsb-release
fi
UBUNTU_VERSION=$(lsb_release -rs)
echo "Detected Ubuntu version: $UBUNTU_VERSION"
if [[ "$UBUNTU_VERSION" != "22.04" && "$UBUNTU_VERSION" != "20.04" ]]; then
    echo "Warning: This script is tested on Ubuntu 22.04. Other versions may work but are not guaranteed."
fi
echo ""
echo "Configuring inotify watch limit for large workspaces..."
TARGET_INOTIFY_WATCHES=524288
CURRENT_INOTIFY_WATCHES=0
if [ -r /proc/sys/fs/inotify/max_user_watches ]; then
    CURRENT_INOTIFY_WATCHES=$(cat /proc/sys/fs/inotify/max_user_watches)
fi
if [ "$CURRENT_INOTIFY_WATCHES" -lt "$TARGET_INOTIFY_WATCHES" ]; then
    if grep -q '^fs.inotify.max_user_watches' /etc/sysctl.conf 2>/dev/null; then
        sed -i "s/^fs\.inotify\.max_user_watches=.*/fs.inotify.max_user_watches=${TARGET_INOTIFY_WATCHES}/" /etc/sysctl.conf
    else
        echo "fs.inotify.max_user_watches=${TARGET_INOTIFY_WATCHES}" >> /etc/sysctl.conf
    fi
    if sysctl -w fs.inotify.max_user_watches="${TARGET_INOTIFY_WATCHES}" >/dev/null 2>&1; then
        echo "Set fs.inotify.max_user_watches=${TARGET_INOTIFY_WATCHES} (consumes up to ~540 MiB if fully utilized)."
    else
        echo "Warning: Failed to apply inotify watch limit via sysctl; please verify manually."
    fi
else
    echo "fs.inotify.max_user_watches already set to ${CURRENT_INOTIFY_WATCHES}."
fi
# Check for NVIDIA GPU
echo ""
echo "Checking for NVIDIA GPU..."
if command -v nvidia-smi &> /dev/null; then
    nvidia-smi
    echo "NVIDIA GPU detected"
    DRIVER_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader,nounits | head -n 1 | tr -d ' ')
    if [[ -n "$DRIVER_VERSION" ]]; then
        if ! dpkg --compare-versions "$DRIVER_VERSION" ge "$REQUIRED_DRIVER_VERSION"; then
            echo "Current NVIDIA driver: $DRIVER_VERSION"
            echo "CUDA ${CUDA_SHORT_VERSION} Update 2 requires driver ${REQUIRED_DRIVER_VERSION}+."
            echo "This script will upgrade it automatically."
        else
            echo "NVIDIA driver version: $DRIVER_VERSION (compatible with CUDA ${CUDA_SHORT_VERSION} Update 2)"
        fi
    fi
else
    echo "NVIDIA GPU not detected. Please ensure NVIDIA drivers are installed."
    exit 1
fi
GPU_COMPUTE_CAP_RAW=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | head -n 1 | tr -d '[:space:]')
GPU_COMPUTE_SM_NUM=$(echo "${GPU_COMPUTE_CAP_RAW}" | tr -d '.')
FLASH_ATTN_TARGET_SM="${GPU_COMPUTE_SM_NUM:-121}"
FLASH_ATTN_TARGET_ARCHS="${FLASH_ATTENTION_CUDA_ARCHS:-${FLASH_ATTN_TARGET_SM}}"
export FLASH_ATTN_TARGET_SM FLASH_ATTN_TARGET_ARCHS
if [[ -n "${GPU_COMPUTE_SM_NUM}" ]]; then
    echo "Detected GPU compute capability: sm_${GPU_COMPUTE_SM_NUM}"
    SOURCE_BUILD_ALLOWED=0
    export GPU_COMPUTE_SM_NUM
else
    SOURCE_BUILD_ALLOWED=0
    echo "Could not detect GPU compute capability; defaulting to prebuilt wheels only."
fi
# Use the current arch for CUTLASS build targets (fall back to defaults if unknown)
if [ -n "${GPU_COMPUTE_SM_NUM}" ]; then
    CUTLASS_NVCC_ARCHS_VALUE="${GPU_COMPUTE_SM_NUM}"
else
    CUTLASS_NVCC_ARCHS_VALUE="${CUTLASS_NVCC_ARCHS_VALUE_DEFAULT}"
fi
if [ -n "${GPU_COMPUTE_SM_NUM}" ]; then
    sm_len=${#GPU_COMPUTE_SM_NUM}
    sm_major="${GPU_COMPUTE_SM_NUM:0:$((sm_len-1))}"
    sm_minor="${GPU_COMPUTE_SM_NUM: -1}"
    # Use the Blackwell 'a' suffix to enable TMA instructions in ptxas
    if [ "${GPU_COMPUTE_SM_NUM}" -eq 100 ]; then
        TE_TORCH_ARCH_LIST="${sm_major}.${sm_minor}a"
        TE_CUTLASS_ARCHS="100a"
    else
        TE_TORCH_ARCH_LIST="${sm_major}.${sm_minor}"
        TE_CUTLASS_ARCHS="${GPU_COMPUTE_SM_NUM}"
    fi
else
    TE_TORCH_ARCH_LIST="10.0"
    TE_CUTLASS_ARCHS="${CUTLASS_NVCC_ARCHS_VALUE_DEFAULT}"
fi
# Ensure open kernel modules are enabled for Blackwell GPUs
MODPROBE_CONF="/etc/modprobe.d/nvidia-open.conf"
if [[ ! -f "$MODPROBE_CONF" ]] || ! grep -q "NVreg_OpenRmEnableUnsupportedGpus=1" "$MODPROBE_CONF"; then
    echo "Configuring NVIDIA open kernel modules for Blackwell GPUs..."
    cat <<'EOF' > "$MODPROBE_CONF"
options nvidia NVreg_OpenRmEnableUnsupportedGpus=1 NVreg_RestrictProfilingToAdminUsers=0
EOF
    update-initramfs -u
    if lsmod | grep -q "^nvidia"; then
        echo "Reloading NVIDIA kernel modules to enable profiling counters..."
        systemctl stop nvidia-persistenced >/dev/null 2>&1 || true
        for module in nvidia_uvm nvidia_peermem nvidia_modeset nvidia_drm nvidia; do
            if lsmod | grep -q "^${module}"; then
                modprobe -r "${module}" >/dev/null 2>&1 || true
            fi
        done
        modprobe nvidia NVreg_OpenRmEnableUnsupportedGpus=1 NVreg_RestrictProfilingToAdminUsers=0 >/dev/null 2>&1 || true
        for module in nvidia_modeset nvidia_uvm nvidia_peermem; do
            modprobe "${module}" >/dev/null 2>&1 || true
        done
        systemctl start nvidia-persistenced >/dev/null 2>&1 || true
    fi
fi
# Update system packages
echo ""
echo "Updating system packages..."
# Fix apt_pkg module before apt update (if Python was upgraded)
if ! python3 -c "import apt_pkg" 2>/dev/null; then
    echo "Fixing apt_pkg module..."
    apt install -y --reinstall python3-apt 2>/dev/null || true
fi
# Disable command-not-found APT hook if it's causing issues with Python upgrade
if [ -f /etc/apt/apt.conf.d/50command-not-found ] && ! /usr/lib/cnf-update-db 2>/dev/null; then
    echo "Disabling problematic command-not-found APT hook..."
    rm -f /etc/apt/apt.conf.d/50command-not-found
fi