ai-dynamo · tianhaox · Dec 3, 2025 · Nov 22, 2025 · Nov 24, 2025 · Nov 24, 2025
@@ -21,6 +21,7 @@
 /src/aiconfigurator/sdk/perf_database.py @Arsene12358 @YijiaZhao @ilyasher @xutizhou @AichenF @tianhaox @jasonqinzhou
 /src/aiconfigurator/sdk/task.py @tianhaox @jasonqinzhou @ilyasher @Arsene12358
 /src/aiconfigurator/sdk/utils.py @tianhaox @jasonqinzhou @simone-chen @Arsene12358
+/src/aiconfigurator/sdk/suppport_matrix.py @Harrilee
 
 # cli
 /src/aiconfigurator/cli @Ethan-ES @tianhaox @jasonqinzhou @Arsene12358
@@ -71,6 +72,7 @@
 /tools/automation @tianhaox @Ethan-ES @jasonqinzhou @Arsene12358
 /tools/sanity_check @tianhaox @YijiaZhao @jasonqinzhou @Arsene12358
 /tools/simple_sdk_demo @tianhaox @jasonqinzhou @Arsene12358
+/tools/support_matrix/generate_support_matrix.py @Harrilee
 
 # misc
 /ATTRIBUTIONS.md @saturley-hall
@@ -85,4 +87,4 @@
 /.pre-commit-config.yaml @saturley-hall
 
 # CI/CD and workflows
-/.github/workflows @saturley-hall @tianhaox @jasonqinzhou @Arsene12358
+/.github/workflows @saturley-hall @tianhaox @jasonqinzhou @Arsene12358 @Harrilee
@@ -0,0 +1,30 @@
+name: "Daily Support Matrix Test"
+
+on:
+  schedule:
+    # Run daily at 15:00 UTC (cron is evaluated in UTC): 7 AM PST / 8 AM PDT, 11 PM China
+    - cron: "0 15 * * *"
+  workflow_dispatch: # Allow manual trigger (with no inputs)
+
+jobs:
+  test-support-matrix:
+    name: Test Support Matrix
+    runs-on: ubuntu-latest
+    timeout-minutes: 480 # 8 hours requested; NOTE(review): GitHub-hosted runners cap a job at 6 hours - confirm
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+        with:
+          lfs: true
+
+      - name: Git LFS Pull
+        run: git lfs pull
+
+      - name: Build test container
+        run: |
+          docker build -f docker/Dockerfile -t aiconfigurator:test --target test .
+
+      - name: Run support matrix tests in container
+        run: |
+          docker run --name aic-support-matrix --env TEST_SUPPORT_MATRIX=true aiconfigurator:test \
+            pytest tests/sdk/support_matrix/test_support_matrix.py -v --tb=short
@@ -305,7 +305,11 @@ To go through the process, refer to the [guidance](collector/README.md) under th
 | gb200_sxm | TRTLLM(1.0.0rc6) | ✅ |
 | a100_sxm | TRTLLM(1.0.0) | ✅ |
 
-> **Note**: b200 and gb200 are under dev. Results are to be aligned. For preview now. 
+> **Note**: b200 and gb200 support is under development and provided as a preview; results are still to be aligned.
+
+#### Detailed Support Matrix
+
+For a comprehensive breakdown of which model/system/backend/version combinations are supported in both aggregated and disaggregated modes, refer to the [**support matrix CSV**](src/aiconfigurator/systems/support_matrix.csv). This file is automatically generated and tested to ensure accuracy across all supported configurations.
 
 ## Contributing and Development
 

@@ -33,3 +33,4 @@ RUN WHL=$(ls -d /wheelhouse/*) && \
 COPY pytest.ini /workspace/
 COPY tests/ /workspace/tests/
 COPY src/ /workspace/src/
+COPY tools/ /workspace/tools/
@@ -54,6 +54,7 @@ dependencies = [
     "pydantic~=2.11.4",
     "pyyaml>=6.0",
     "scipy>=1.13.1",
+    "tqdm>=4.0.0",
     "uvicorn>=0.34.2",
     "bokeh",
     "nvidia-ml-py",

@@ -431,6 +431,7 @@ def _get_summary_df(
             Get all worker candidates based on the given search space
             """
             summary_df = pd.DataFrame(columns=common.ColumnsStatic)
+            exceptions = []
 
             for parallel_config in parallel_config_list:
                 tp_size, pp_size, dp_size, moe_tp_size, moe_ep_size = parallel_config
@@ -480,13 +481,18 @@ def _get_summary_df(
                             )
                         else:  # larger b will always OOM
                             break
-                except Exception:
+                except Exception as e:
                     logger.exception(
                         f"Error getting candidate workers with parallel config: "
                         f"tp={tp_size}, pp={pp_size}, dp={dp_size}, moe_tp={moe_tp_size}, "
                         f"moe_ep={moe_ep_size}; skipping this combination"
                     )
+                    exceptions.append(e)
                     continue
+            if summary_df.empty:
+                raise RuntimeError(
+                    f"No results found for any parallel configuration. Showing last exception: {exceptions[-1]}"
+                ) from exceptions[-1]
             return summary_df
 
         def _find_best_result_under_constraints(

@@ -113,6 +113,7 @@ def agg_pareto(
 
     # agg is agg server, the loop over parallel is outside here.
     results_df = pd.DataFrame(columns=ColumnsAgg)
+    exceptions = []
     for parallel_config in parallel_config_list:
         tp_size, pp_size, dp_size, moe_tp_size, moe_ep_size = parallel_config
         logger.debug(
@@ -151,8 +152,8 @@ def agg_pareto(
                     results_df = result_df
                 else:
                     results_df = pd.concat([results_df, result_df], axis=0, ignore_index=True)
-        except Exception:
-            logger.exception(
+        except Exception as e:
+            logger.info(
                 "Error getting candidate workers with parallel config: tp=%s, pp=%s, dp=%s, "
                 "moe_tp=%s, moe_ep=%s, skip this combination",
                 tp_size,
@@ -161,8 +162,15 @@ def agg_pareto(
                 moe_tp_size,
                 moe_ep_size,
             )
+            exceptions.append(e)
             continue
 
+    # No results: raise RuntimeError chained from the last exception. NOTE(review): IndexError if exceptions is empty
+    if results_df.empty:
+        raise RuntimeError(
+            f"No results found for any parallel configuration. Showing last exception: {exceptions[-1]}"
+        ) from exceptions[-1]
+
     results_df = results_df.sort_values(by="tokens/s/gpu", ascending=False).reset_index(drop=True)
 
     return results_df