Skip to content
Merged
Show file tree
Hide file tree
Changes from 12 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion .github/CODEOWNERS
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
/src/aiconfigurator/sdk/perf_database.py @Arsene12358 @YijiaZhao @ilyasher @xutizhou @AichenF @tianhaox @jasonqinzhou
/src/aiconfigurator/sdk/task.py @tianhaox @jasonqinzhou @ilyasher @Arsene12358
/src/aiconfigurator/sdk/utils.py @tianhaox @jasonqinzhou @simone-chen @Arsene12358
/src/aiconfigurator/sdk/suppport_matrix.py @Harrilee

# cli
/src/aiconfigurator/cli @Ethan-ES @tianhaox @jasonqinzhou @Arsene12358
Expand Down Expand Up @@ -71,6 +72,7 @@
/tools/automation @tianhaox @Ethan-ES @jasonqinzhou @Arsene12358
/tools/sanity_check @tianhaox @YijiaZhao @jasonqinzhou @Arsene12358
/tools/simple_sdk_demo @tianhaox @jasonqinzhou @Arsene12358
/tools/support_matrix/generate_support_matrix.py @Harrilee

# misc
/ATTRIBUTIONS.md @saturley-hall
Expand All @@ -85,4 +87,4 @@
/.pre-commit-config.yaml @saturley-hall

# CI/CD and workflows
/.github/workflows @saturley-hall @tianhaox @jasonqinzhou @Arsene12358
/.github/workflows @saturley-hall @tianhaox @jasonqinzhou @Arsene12358 @Harrilee
30 changes: 30 additions & 0 deletions .github/workflows/daily-support-matrix.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Scheduled workflow: builds the test container image and runs the support matrix test module inside it.
name: "Daily Support Matrix Test"

on:
schedule:
# Run daily at 15:00 UTC (7 AM PST / 8 AM PDT, 11 PM China)
- cron: "0 15 * * *"
workflow_dispatch: # Allow manual trigger (with no inputs)

jobs:
test-support-matrix:
name: Test Support Matrix
runs-on: ubuntu-latest
# NOTE(review): GitHub-hosted runners are documented to cap a job at 6 hours of execution — confirm this 8-hour timeout can actually take effect on ubuntu-latest.
timeout-minutes: 480 # 8 hours
steps:
- name: Checkout repository
uses: actions/checkout@v4
with:
lfs: true

# NOTE(review): the checkout step above already sets lfs: true, so this explicit pull may be redundant — confirm before removing.
- name: Git LFS Pull
run: git lfs pull

# Builds the "test" target of docker/Dockerfile and tags the image aiconfigurator:test.
- name: Build test container
run: |
docker build -f docker/Dockerfile -t aiconfigurator:test --target test .

# TEST_SUPPORT_MATRIX=true is passed into the container; presumably it enables the support matrix tests — verify against the test module.
- name: Run support matrix tests in container
run: |
docker run --name aic-support-matrix --env TEST_SUPPORT_MATRIX=true aiconfigurator:test \
pytest tests/sdk/support_matrix/test_support_matrix.py -v --tb=short
6 changes: 5 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -305,7 +305,11 @@ To go through the process, refer to the [guidance](collector/README.md) under th
| gb200_sxm | TRTLLM(1.0.0rc6) | ✅ |
| a100_sxm | TRTLLM(1.0.0) | ✅ |

> **Note**: b200 and gb200 are under dev. Results are to be aligned. For preview now.
> **Note**: b200 and gb200 are under dev. Results are to be aligned. For preview now.

#### Detailed Support Matrix

For a comprehensive breakdown of which model/system/backend/version combinations are supported in both aggregated and disaggregated modes, refer to the [**support matrix CSV**](src/aiconfigurator/systems/support_matrix.csv). This file is automatically generated and tested to ensure accuracy across all supported configurations.

## Contributing and Development

Expand Down
1 change: 1 addition & 0 deletions docker/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -33,3 +33,4 @@ RUN WHL=$(ls -d /wheelhouse/*) && \
COPY pytest.ini /workspace/
COPY tests/ /workspace/tests/
COPY src/ /workspace/src/
COPY tools/ /workspace/tools/
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ dependencies = [
"pydantic~=2.11.4",
"pyyaml>=6.0",
"scipy>=1.13.1",
"tqdm>=4.0.0",
"uvicorn>=0.34.2",
"bokeh",
"nvidia-ml-py",
Expand Down
8 changes: 7 additions & 1 deletion src/aiconfigurator/sdk/inference_session.py
Original file line number Diff line number Diff line change
Expand Up @@ -431,6 +431,7 @@ def _get_summary_df(
Get all worker candidates based on the given search space
"""
summary_df = pd.DataFrame(columns=common.ColumnsStatic)
exceptions = []

for parallel_config in parallel_config_list:
tp_size, pp_size, dp_size, moe_tp_size, moe_ep_size = parallel_config
Expand Down Expand Up @@ -480,13 +481,18 @@ def _get_summary_df(
)
else: # larger b will always OOM
break
except Exception:
except Exception as e:
logger.exception(
f"Error getting candidate workers with parallel config: "
f"tp={tp_size}, pp={pp_size}, dp={dp_size}, moe_tp={moe_tp_size}, "
f"moe_ep={moe_ep_size}; skipping this combination"
)
exceptions.append(e)
continue
if summary_df.empty:
raise RuntimeError(
f"No results found for any parallel configuration. Showing last exception: {exceptions[-1]}"
) from exceptions[-1]
return summary_df

def _find_best_result_under_constraints(
Expand Down
12 changes: 10 additions & 2 deletions src/aiconfigurator/sdk/pareto_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,7 @@ def agg_pareto(

# agg is agg server, the loop over parallel is outside here.
results_df = pd.DataFrame(columns=ColumnsAgg)
exceptions = []
for parallel_config in parallel_config_list:
tp_size, pp_size, dp_size, moe_tp_size, moe_ep_size = parallel_config
logger.debug(
Expand Down Expand Up @@ -151,8 +152,8 @@ def agg_pareto(
results_df = result_df
else:
results_df = pd.concat([results_df, result_df], axis=0, ignore_index=True)
except Exception:
logger.exception(
except Exception as e:
logger.info(
"Error getting candidate workers with parallel config: tp=%s, pp=%s, dp=%s, "
"moe_tp=%s, moe_ep=%s, skip this combination",
tp_size,
Expand All @@ -161,8 +162,15 @@ def agg_pareto(
moe_tp_size,
moe_ep_size,
)
exceptions.append(e)
continue

# If no parallel configuration produced results, raise a RuntimeError chained from the last captured exception
if results_df.empty:
raise RuntimeError(
f"No results found for any parallel configuration. Showing last exception: {exceptions[-1]}"
) from exceptions[-1]

results_df = results_df.sort_values(by="tokens/s/gpu", ascending=False).reset_index(drop=True)

return results_df
Expand Down
Loading