Skip to content

Commit 674dde8

Browse files
keivenchang authored and csabakecskemeti committed
fix: sanity_check.py 1) sglang Python site-packages check 2) adding HuggingFace cache checking (ai-dynamo#3890)
Signed-off-by: Keiven Chang <[email protected]>
1 parent 214ae89 commit 674dde8

File tree

2 files changed

+228
-10
lines changed

2 files changed

+228
-10
lines changed

deploy/dynamo_check.py

Lines changed: 0 additions & 1 deletion
This file was deleted.

deploy/sanity_check.py

Lines changed: 228 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,17 @@
1313
- LLM frameworks (vllm, sglang, tensorrt_llm)
1414
- Dynamo runtime and framework components
1515
- File system (permissions and disk space, detailed with --thorough-check)
16+
- HuggingFace model cache (detailed with --thorough-check)
1617
- Installation status and component availability
1718
19+
IMPORTANT: This script is STANDALONE and uses only Python stdlib (no Dynamo components).
20+
21+
Why: Must work before Dynamo is built/installed (CI, fresh containers, build failures).
22+
This tool is for pre-deployment validation; dynamo.common.config_dump is for runtime.
23+
24+
Hard-coded paths: Uses defaults (e.g., ~/.cache/huggingface/hub) for predictable
25+
behavior even when environment variables are misconfigured. See class docs for details.
26+
1827
The output uses status indicators:
1928
- ✅ Component found and working
2029
- ❌ Component missing or error
@@ -35,13 +44,17 @@
3544
├─ OS Ubuntu 24.04.1 LTS (Noble Numbat) (Linux 6.11.0-28-generic x86_64), Memory=26.7/125.5 GiB, Cores=32
3645
├─ User info: user=ubuntu, uid=1000, gid=1000
3746
├─ ✅ NVIDIA GPU NVIDIA RTX 6000 Ada Generation, driver 570.133.07, CUDA 12.8, Power=26.14/300.00 W, Memory=289/49140 MiB
47+
├─ 🤖Framework
48+
│ ├─ ✅ vLLM: 0.10.1.1, module=/opt/vllm/vllm/__init__.py, exec=/opt/dynamo/venv/bin/vllm
49+
│ └─ ✅ Sglang: 0.3.0, module=/opt/sglang/sglang/__init__.py
3850
├─ File System
3951
│ ├─ ✅ Dynamo workspace ($HOME/dynamo) writable
4052
│ ├─ ✅ Dynamo .git directory writable
4153
│ ├─ ✅ Rustup home ($HOME/.rustup) writable
4254
│ ├─ ✅ Cargo home ($HOME/.cargo) writable
4355
│ ├─ ✅ Cargo target ($HOME/dynamo/.build/target) writable
4456
│ └─ ✅ Python site-packages ($HOME/dynamo/venv/lib/python3.12/site-packages) writable
57+
├─ ✅ Hugging Face Cache 3 models in ~/.cache/huggingface/hub
4558
├─ ✅ Cargo $HOME/.cargo/bin/cargo, cargo 1.89.0 (c24e10642 2025-06-23)
4659
│ ├─ Cargo home directory CARGO_HOME=$HOME/.cargo
4760
│ └─ Cargo target directory CARGO_TARGET_DIR=$HOME/dynamo/.build/target
@@ -52,9 +65,6 @@
5265
├─ ✅ Python 3.12.3, /opt/dynamo/venv/bin/python
5366
│ ├─ ✅ PyTorch 2.7.1+cu128, ✅torch.cuda.is_available
5467
│ └─ PYTHONPATH not set
55-
├─ 🤖Framework
56-
│ ├─ ✅ vLLM: 0.10.1.1, module=/opt/vllm/vllm/__init__.py, exec=/opt/dynamo/venv/bin/vllm
57-
│ └─ ✅ Sglang: 0.3.0, module=/opt/sglang/sglang/__init__.py
5868
└─ Dynamo $HOME/dynamo, SHA: a03d29066, Date: 2025-08-30 16:22:29 PDT
5969
├─ ✅ Runtime components ai-dynamo-runtime 0.4.1
6070
│ │ /opt/dynamo/venv/lib/python3.12/site-packages/ai_dynamo_runtime-0.4.1.dist-info: created=2025-08-30 19:14:29 PDT
@@ -79,8 +89,8 @@
7989
python deploy/sanity_check.py [--thorough-check] [--terse]
8090
8191
Options:
82-
--thorough-check Enable thorough checking (file permissions, directory sizes, etc.)
83-
--terse Enable terse output mode
92+
--thorough-check Enable thorough checking (file permissions, directory sizes, HuggingFace model details)
93+
--terse Enable terse output mode (show only essential info and errors)
8494
"""
8595

8696
import datetime
@@ -324,6 +334,9 @@ def __init__(
324334
# Add file permissions check
325335
self.add_child(FilePermissionsInfo(thorough_check=self.thorough_check))
326336

337+
# Add HuggingFace cache check
338+
self.add_child(HuggingFaceInfo(thorough_check=self.thorough_check))
339+
327340
# Add Cargo (always show, even if not found)
328341
self.add_child(CargoInfo(thorough_check=self.thorough_check))
329342

@@ -1103,7 +1116,14 @@ def _check_dynamo_directory_permissions(self):
11031116
)
11041117

11051118
def _check_site_packages_permissions(self):
1106-
"""Check site-packages directory writability"""
1119+
"""Check site-packages directory writability
1120+
1121+
Logic:
1122+
- If running in a virtualenv and its site-packages is writable: PASS
1123+
(system site-packages being read-only is expected and shown as WARNING)
1124+
- If no virtualenv and no writable site-packages: ERROR
1125+
(can't install packages anywhere)
1126+
"""
11071127
try:
11081128
import site
11091129

@@ -1113,15 +1133,33 @@ def _check_site_packages_permissions(self):
11131133
if user_site:
11141134
site_packages_dirs.append(user_site)
11151135

1116-
# Check each existing site-packages directory
1136+
# First pass: check which directories are writable
1137+
writable_dirs = []
1138+
all_results = []
11171139
recursive = self.thorough_check
1140+
11181141
for site_dir in site_packages_dirs:
11191142
if os.path.exists(site_dir):
11201143
results = self._check_permissions_unified(
11211144
[site_dir], "site-packages", recursive=recursive
11221145
)
1123-
for result in results:
1124-
self.add_child(result)
1146+
all_results.append((site_dir, results))
1147+
1148+
# Check if this directory is writable
1149+
if results and results[0].status == NodeStatus.OK:
1150+
writable_dirs.append(site_dir)
1151+
1152+
# Determine if we have at least one writable site-packages
1153+
has_writable_site_packages = len(writable_dirs) > 0
1154+
1155+
# Second pass: add results with adjusted status
1156+
for site_dir, results in all_results:
1157+
for result in results:
1158+
# If we have at least one writable site-packages,
1159+
# downgrade ERROR to WARNING for non-writable ones
1160+
if has_writable_site_packages and result.status == NodeStatus.ERROR:
1161+
result.status = NodeStatus.WARNING
1162+
self.add_child(result)
11251163

11261164
except Exception as e:
11271165
self.add_child(
@@ -1227,6 +1265,187 @@ def format_bytes(bytes_val):
12271265
return "", None
12281266

12291267

1268+
class HuggingFaceInfo(NodeInfo):
    """Hugging Face model cache information (stdlib-only, standalone check).

    HARD-CODED PATH: ~/.cache/huggingface/hub

    The cache location is deliberately NOT taken from the environment, so the
    check behaves the same even when these variables are misconfigured or
    unset (the Hugging Face libraries honor them, this tool does not):
    - HF_HOME: Base directory for Hugging Face cache
    - HUGGINGFACE_HUB_CACHE: Direct path to hub cache

    The only environment variable read by this class is:
    - HF_TOKEN: Authentication token; only its presence (never its value) is
      reported, as a child node.

    For dynamic configuration that respects all HF environment variables, use
    dynamo.common.config_dump at runtime.
    """

    # Default hub cache location; expanded with os.path.expanduser() before use.
    _DEFAULT_CACHE_DIR = "~/.cache/huggingface/hub"
    # Directory-name prefix that marks a model repo inside the hub cache.
    _MODEL_DIR_PREFIX = "models--"

    def __init__(self, thorough_check: bool = False):
        """Inspect the default hub cache and build the matching status node.

        Args:
            thorough_check: When True, per-model sizes are computed (slow) and
                every cached model is listed as a child node.
        """
        # HARD-CODED PATH (not reading HF_HOME or HUGGINGFACE_HUB_CACHE)
        hf_cache_path = os.path.expanduser(self._DEFAULT_CACHE_DIR)

        # isdir (not exists): a regular file at this path is not a usable
        # cache and must not be reported as "directory exists".
        if os.path.isdir(hf_cache_path):
            models = self._get_cached_models(
                hf_cache_path, compute_sizes=thorough_check
            )
            if models:
                self._init_with_models(hf_cache_path, models, thorough_check)
            else:
                self._init_no_models_found(hf_cache_path)
        else:
            self._init_cache_not_available()

        # Add HF_TOKEN info if set (common to all cases)
        self._add_hf_token_info()

    def _init_with_models(
        self, hf_cache_path: str, models: List[tuple], thorough_check: bool
    ):
        """Initialize the node as OK when at least one model is cached.

        Args:
            hf_cache_path: Expanded path of the cache directory.
            models: Tuples of (model_name, download_date, size_str).
            thorough_check: When True, add one child node per model.
        """
        model_count = len(models)
        noun = "model" if model_count == 1 else "models"  # avoid "1 models"
        display_path = self._replace_home_with_var(hf_cache_path)
        super().__init__(
            label="Hugging Face Cache",
            desc=f"{model_count} {noun} in {display_path}",
            status=NodeStatus.OK,
        )

        # Only show detailed model list in thorough mode
        if thorough_check:
            self._add_model_details(models)

    def _init_no_models_found(self, hf_cache_path: str):
        """Initialize the node as WARNING when the cache holds no model repos.

        Args:
            hf_cache_path: Expanded path of the cache directory.
        """
        display_path = self._replace_home_with_var(hf_cache_path)
        super().__init__(
            label="Hugging Face Cache",
            desc=f"directory exists but no models found in {display_path}",
            status=NodeStatus.WARNING,
        )

    def _init_cache_not_available(self):
        """Initialize the node as WARNING when the cache directory is missing."""
        super().__init__(
            label="Hugging Face Cache",
            desc=f"{self._DEFAULT_CACHE_DIR} not available",
            status=NodeStatus.WARNING,
        )

    def _add_model_details(self, models: List[tuple]):
        """Add one INFO child node per cached model (no limit).

        Args:
            models: Tuples of (model_name, download_date, size_str).
        """
        for index, (model_name, download_date, size_str) in enumerate(
            models, start=1
        ):
            self.add_child(
                NodeInfo(
                    label=f"Model {index}",
                    desc=f"{model_name}, downloaded={download_date}, size={size_str}",
                    status=NodeStatus.INFO,
                )
            )

    def _add_hf_token_info(self):
        """Add an HF_TOKEN child node if the variable is set and non-empty.

        Only the fact that a token is set is shown; the value is never printed.
        """
        if os.environ.get("HF_TOKEN"):
            self.add_child(
                NodeInfo(
                    label="HF_TOKEN",
                    desc="<set>",
                    status=NodeStatus.INFO,
                )
            )

    def _get_cached_models(self, cache_path: str, compute_sizes: bool) -> List[tuple]:
        """Get list of cached Hugging Face models with metadata.

        Args:
            cache_path: Path to HuggingFace cache directory
            compute_sizes: Whether to compute directory sizes (slow operation)

        Returns:
            List of tuples (model_name, download_date, size_str), sorted by
            model name. Empty if the directory cannot be listed.
        """
        try:
            entries = os.listdir(cache_path)
        except OSError:
            # Missing, unreadable, or not a directory: nothing to report.
            return []

        prefix = self._MODEL_DIR_PREFIX
        models = []
        for item in entries:
            item_path = os.path.join(cache_path, item)
            # Only count model repos; ignore datasets--, spaces--, blobs, etc.
            if not (item.startswith(prefix) and os.path.isdir(item_path)):
                continue

            # "models--org--repo-name" -> "org/repo-name"; a repo without a
            # namespace ("models--gpt2") -> "gpt2". Only the first "--" is a
            # separator, so any later dashes in the name are preserved.
            repo_id = item[len(prefix):]
            owner, separator, repo_name = repo_id.partition("--")
            if separator:
                display_name = f"{owner}/{repo_name}"
            else:
                display_name = repo_id or item  # bare "models--": keep raw name

            # Approximate the download date with the earlier of st_ctime and
            # st_mtime. NOTE: st_ctime is the creation time only on Windows;
            # on Unix it is the last metadata change.
            try:
                stat_info = os.stat(item_path)
                download_time = min(stat_info.st_ctime, stat_info.st_mtime)
                download_date = self._format_timestamp_pdt(download_time)
            except Exception:
                # Best effort: a bad timestamp must not hide the model itself.
                download_date = "unknown"

            # Get directory size (only when requested)
            size_str = "-"
            if compute_sizes:
                try:
                    size_str = self._format_size(
                        self._get_directory_size_bytes(item_path)
                    )
                except Exception:
                    size_str = "unknown"

            models.append((display_name, download_date, size_str))

        # Sort by model name
        return sorted(models, key=lambda model: model[0])

    def _get_directory_size_bytes(self, directory: str) -> int:
        """Get the total size in bytes of the regular files under a directory.

        Symbolic links are not counted (in the hub cache layout the snapshot
        entries are links to files under blobs/, so counting them would count
        the same data twice). Files that cannot be accessed are skipped.

        Args:
            directory: Root directory to measure.

        Returns:
            Sum of file sizes in bytes; 0 if nothing could be read.
        """
        total_size = 0
        try:
            # os.walk() does not follow directory symlinks and ignores
            # directory listing errors by default.
            for dirpath, _dirnames, filenames in os.walk(directory):
                for filename in filenames:
                    filepath = os.path.join(dirpath, filename)
                    if os.path.islink(filepath):  # Skip symbolic links
                        continue
                    try:
                        total_size += os.path.getsize(filepath)
                    except OSError:
                        continue  # Skip files that can't be accessed
        except OSError:
            pass  # Best effort: return what was summed so far
        return total_size

    def _format_size(self, size_bytes: int) -> str:
        """Format size in bytes to human readable format.

        Uses 1024-based steps with the labels B, KB, MB, GB, TB and about
        three significant digits (e.g. "1.00 MB", "12.3 GB", "456 KB").

        Args:
            size_bytes: Size in bytes.

        Returns:
            The formatted size, e.g. "0 B", "512 B", "1.00 KB".
        """
        if size_bytes == 0:
            return "0 B"

        units = ["B", "KB", "MB", "GB", "TB"]
        last_unit = len(units) - 1
        size = float(size_bytes)
        unit_index = 0

        # Promote at 1023.5 rather than 1024: values in [1023.5, 1024) would
        # otherwise be rounded by the zero-decimal format and shown as
        # "1024 KB" instead of "1.00 MB".
        while unit_index < last_unit and size >= 1023.5:
            size /= 1024.0
            unit_index += 1

        unit = units[unit_index]
        if unit_index == 0:  # Bytes
            return f"{int(size)} {unit}"

        # Choose the precision from the rounded value so that e.g. 99.96 is
        # shown as "100", not "100.0".
        if round(size, 1) >= 100:
            digits = 0
        elif round(size, 2) >= 10:
            digits = 1
        else:
            digits = 2
        return f"{size:.{digits}f} {unit}"
1447+
1448+
12301449
class CargoInfo(NodeInfo):
12311450
"""Cargo tool information"""
12321451

0 commit comments

Comments
 (0)