Skip to content
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions .github/CODEOWNERS
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
/src/aiconfigurator/sdk/perf_database.py @Arsene12358 @YijiaZhao @ilyasher @xutizhou @AichenF
/src/aiconfigurator/sdk/task.py @tianhaox @jasonqinzhou @ilyasher
/src/aiconfigurator/sdk/utils.py @tianhaox @jasonqinzhou @simone-chen
/src/aiconfigurator/sdk/suppport_matrix.py @Harrilee

# cli
/src/aiconfigurator/cli @Ethan-ES @tianhaox
Expand Down Expand Up @@ -71,7 +72,7 @@
/tools/automation @tianhaox @Ethan-ES
/tools/sanity_check @tianhaox @YijiaZhao
/tools/simple_sdk_demo @tianhaox @jasonqinzhou

/tools/generate_support_matrix.py @Harrilee
# misc
/ATTRIBUTIONS.md @saturley-hall
/CODE_OF_CONDUCT.md @saturley-hall
Expand All @@ -85,4 +86,4 @@
/.pre-commit-config.yaml @saturley-hall

# CI/CD and workflows
/.github/workflows @saturley-hall @tianhaox @jasonqinzhou
/.github/workflows @saturley-hall @tianhaox @jasonqinzhou @Harrilee
30 changes: 30 additions & 0 deletions .github/workflows/daily-support-matrix.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
name: "Daily Support Matrix Test"

on:
schedule:
# Run daily at 7 AM PT / 11 PM China / 3 PM UTC
- cron: "0 15 * * *"
workflow_dispatch: # Allow manual trigger (with no inputs)

jobs:
test-support-matrix:
name: Test Support Matrix
runs-on: ubuntu-latest
timeout-minutes: 480 # 8 hours
steps:
- name: Checkout repository
uses: actions/checkout@v4
with:
lfs: true

- name: Git LFS Pull
run: git lfs pull

- name: Build test container
run: |
docker build -f docker/Dockerfile -t aiconfigurator:test --target test .

- name: Run support matrix tests in container
run: |
docker run --name aic-support-matrix --env TEST_SUPPORT_MATRIX=true aiconfigurator:test \
pytest tests/sdk/support_matrix/test_support_matrix.py -v --tb=short
6 changes: 5 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -305,7 +305,11 @@ To go through the process, refer to the [guidance](collector/README.md) under th
| gb200_sxm | TRTLLM(1.0.0rc6) | ✅ |
| a100_sxm | TRTLLM(1.0.0) | ✅ |

> **Note**: b200 and gb200 are under dev. Results are to be aligned. For preview now.
> **Note**: b200 and gb200 are under dev. Results are to be aligned. For preview now.

#### Detailed Support Matrix

For a comprehensive breakdown of which model/system/backend/version combinations are supported in both aggregated and disaggregated modes, refer to the [**support matrix CSV**](src/aiconfigurator/systems/support_matrix.csv). This file is automatically generated and tested to ensure accuracy across all supported configurations.

## Contributing and Development

Expand Down
1 change: 1 addition & 0 deletions docker/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -33,3 +33,4 @@ RUN WHL=$(ls -d /wheelhouse/*) && \
COPY pytest.ini /workspace/
COPY tests/ /workspace/tests/
COPY src/ /workspace/src/
COPY tools/ /workspace/tools/
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ dependencies = [
"pydantic~=2.11.4",
"pyyaml>=6.0",
"scipy>=1.13.1",
"tqdm>=4.0.0",
"uvicorn>=0.34.2",
"bokeh",
"nvidia-ml-py",
Expand Down
250 changes: 250 additions & 0 deletions src/aiconfigurator/sdk/suppport_matrix.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,250 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

import csv
import logging
from collections.abc import Iterator

from tqdm import tqdm

from aiconfigurator.sdk import common, perf_database
from aiconfigurator.sdk.task import TaskConfig, TaskRunner

logger = logging.getLogger(__name__)

# Test configuration constants: one fixed workload used to probe every
# (model, system, backend, version) combination.
TOTAL_GPUS = 128  # total GPU budget given to the configuration search
ISL = 4000        # input sequence length, tokens
OSL = 500         # output sequence length, tokens
PREFIX = 500      # prefix length, tokens — presumably shared/cached prefix; confirm against TaskConfig
TTFT = 2000.0     # target time-to-first-token, ms
TPOT = 50.0       # target time-per-output-token, ms


class SupportMatrix:
def __init__(self):
self.models: set[str] = self.get_models()
# database structure: {system: {backend: {version}}}
self.databases: dict[str, dict[str, dict[str, str]]] = self.load_databases()

def get_models(self):
return set[str](common.SupportedModels.keys())

def get_systems(self):
return set(common.SupportedSystems)

def get_backends(self):
return set(x.value for x in common.BackendName)

def load_databases(self):
return perf_database.get_all_databases()

def __get_hardware_and_backend_combinations(self) -> list[tuple[str, str, str]]:
"""
Iterate over all combinations of hardware, and inference backend, version.
"""
for hardware in self.get_systems():
for backend in self.get_backends():
for version in self.databases[hardware][backend]:
yield hardware, backend, version

def __get_model_and_hardware_and_backend_combinations(self) -> list[tuple[str, str, str, str]]:
"""
Iterate over all combinations of models, hardware, and inference backend, version.
"""
for hardware, backend, version in self.__get_hardware_and_backend_combinations():
for model in self.models:
yield model, hardware, backend, version

def generate_combinations(self):
"""
Generate all combinations of models, hardware, and inference backend, version.
"""
# get all combinations of hardware, and inference backend, version
combinations = list(self.__get_model_and_hardware_and_backend_combinations())
return combinations

def run_single_test(
self,
model: str,
system: str,
backend: str,
version: str,
) -> tuple[dict[str, bool], dict[str, str | None]]:
"""
Run a single configuration test for both agg and disagg modes.

Args:
model: Model name
system: System/hardware name
backend: Backend name
version: Backend version

Returns:
Tuple of (dict with results, dict with error messages)
Both dicts have keys "agg" and "disagg"
"""
modes_to_test = ["agg", "disagg"]
results = {}
error_messages = {}

for mode in modes_to_test:
try:
# Create TaskConfig for the test
task_config_kwargs = {
"serving_mode": mode,
"model_name": model,
"system_name": system,
"backend_name": backend,
"backend_version": version,
"total_gpus": TOTAL_GPUS,
"isl": ISL,
"osl": OSL,
"prefix": PREFIX,
"ttft": TTFT,
"tpot": TPOT,
}

# For disagg mode, set decode_system_name
if mode == "disagg":
task_config_kwargs["decode_system_name"] = system

task_config = TaskConfig(**task_config_kwargs)

# Run the configuration
runner = TaskRunner()
result = runner.run(task_config)

# Check if we got valid results
pareto_frontier_df = result.get("pareto_frontier_df")
if pareto_frontier_df is not None and not pareto_frontier_df.empty:
results[mode] = True
error_messages[mode] = None
else:
logger.debug(
"Configuration returned no results: %s, %s, %s, %s, mode=%s",
model,
system,
backend,
version,
mode,
)
results[mode] = False
error_messages[mode] = "Configuration returned no results"

except Exception as e:
logger.debug(
"Configuration failed: %s, %s, %s, %s, mode=%s - Error: %s",
model,
system,
backend,
version,
mode,
str(e),
)
results[mode] = False
error_messages[mode] = str(e)

return results, error_messages

def test_support_matrix(self) -> list[tuple[str, str, str, str, str, bool, str | None]]:
"""
Test whether each combination is supported by AIC.
Tests both agg and disagg modes for each combination and captures error messages.

Returns:
List of tuples (model, system, backend, version, mode, success, err_msg)
Returns separate entries for agg and disagg modes
"""
# Print configuration
print("\n" + "=" * 80)
print("AIConfigurator Support Matrix Test")
print("=" * 80)
print("Testing both agg and disagg modes for all combinations")
print(f"Total GPUs: {TOTAL_GPUS}")
print(f"Input Sequence Length (ISL): {ISL}")
print(f"Output Sequence Length (OSL): {OSL}")
print(f"Prefix: {PREFIX}")
print(f"Target TTFT: {TTFT}ms")
print(f"Target TPOT: {TPOT}ms")
print("=" * 80 + "\n")

combinations = self.generate_combinations()
results = []

# Use tqdm for progress tracking
for model, system, backend, version in tqdm(
combinations,
desc="Testing support matrix",
unit="config",
):
success_dict, error_dict = self.run_single_test(
model=model,
system=system,
backend=backend,
version=version,
)

# Add separate entries for agg and disagg modes
for mode in success_dict:
results.append((model, system, backend, version, mode, success_dict[mode], error_dict[mode]))

# Print results summary
self._print_results_summary(results)

return results

def _print_results_summary(self, results: list[tuple[str, str, str, str, str, bool, str | None]]) -> None:
"""Print summary of test results."""
total_tests = len(results)
passed = sum(1 for _, _, _, _, _, success, _ in results if success)
failed = total_tests - passed

print("\n" + "=" * 80)
print("Test Results Summary")
print("=" * 80)
print(f"Total configurations tested: {total_tests}")
print(f"✓ Passed: {passed} ({100 * passed / total_tests:.1f}%)")
print(f"✗ Failed: {failed} ({100 * failed / total_tests:.1f}%)")
print("=" * 80)

# Group results by status
passed_configs = []
failed_configs = []

for model, system, backend, version, mode, success, err_msg in results:
config = (model, system, backend, version, mode)
if success:
passed_configs.append(config)
else:
failed_configs.append(config)

# Print passed configurations
if passed_configs:
print(f"\n✓ Passed Configurations ({len(passed_configs)}):")
for model, system, backend, version, mode in sorted(passed_configs):
print(f" • {model} on {system} with {backend} v{version} ({mode})")

# Print failed configurations
if failed_configs:
print(f"\n✗ Failed Configurations ({len(failed_configs)}):")
for model, system, backend, version, mode in sorted(failed_configs):
print(f" • {model} on {system} with {backend} v{version} ({mode})")

def save_results_to_csv(
self, results: list[tuple[str, str, str, str, str, bool, str | None]], output_file: str
) -> None:
"""
Save test results to a CSV file.

Args:
results: List of tuples (model, system, backend, version, mode, success, err_msg)
output_file: Path to the output CSV file
"""

with open(output_file, "w", newline="") as f:
writer = csv.writer(f)
writer.writerow(["Model", "System", "Backend", "Version", "Mode", "Status", "ErrMsg"])
for model, system, backend, version, mode, success, err_msg in results:
status = "PASS" if success else "FAIL"
writer.writerow([model, system, backend, version, mode, status, err_msg or ""])
print(f"\nResults saved to: {output_file}")
Loading