12 changes: 12 additions & 0 deletions .github/codeql/codeql-config.yml
@@ -0,0 +1,12 @@
name: "CodeQL configuration for datason"

# Restrict analysis to Python until Rust code is added.
languages:
- python

# Optionally tune paths; keep default behavior otherwise.
paths-ignore:
- docs/
- examples/
- tests/
- scripts/
5 changes: 5 additions & 0 deletions .github/workflows/ci.yml
@@ -291,6 +291,11 @@ jobs:
export PYTHONDONTWRITEBYTECODE=1
export PYTHONOVERSAFE=1

# Only run Supported Types Matrix test in ML/full environments
if [ "${{ matrix.dependency-set.use-ml }}" = "true" ] || [ "${{ matrix.dependency-set.use-dev }}" = "true" ]; then
export RUN_SUPPORTED_TYPES=1
fi

pytest ${{ matrix.dependency-set.test-pattern }} -v \
--cov=datason \
--cov-report=xml:coverage-${{ matrix.dependency-set.name }}.xml \
21 changes: 14 additions & 7 deletions datason/core_new.py
@@ -10,7 +10,7 @@
from datetime import datetime
from decimal import Decimal
from pathlib import Path
from typing import Any, Callable, Dict, Generator, Iterator, List, Optional, Set, Union
from typing import Any, Callable, Dict, Generator, Iterator, List, Optional, Set, Tuple, Union

try:
import pandas as pd
@@ -74,7 +74,9 @@ def normalize_numpy_types(obj: Any) -> Any: # Match exact signature from type_h
_UUID_CACHE_SIZE_LIMIT = 100 # Small cache for common UUIDs

# OPTIMIZATION: Collection processing cache for bulk operations
_COLLECTION_COMPATIBILITY_CACHE: Dict[int, str] = {} # Maps collection id to compatibility status
# Cache homogeneity results per collection identity and size to avoid stale results
# when a collection is mutated (e.g., empty -> single item) but retains identity.
_COLLECTION_COMPATIBILITY_CACHE: Dict[Tuple[int, int], str] = {}
_COLLECTION_CACHE_SIZE_LIMIT = 200 # Smaller cache for collections

# OPTIMIZATION: Memory allocation optimization - Phase 1 Step 1.4
@@ -2102,6 +2104,7 @@ def _is_homogeneous_collection(

# SECURITY: Check for circular references
obj_id = id(obj)
obj_len = len(obj) if isinstance(obj, (list, tuple, dict)) else 0
if obj_id in _seen_ids:
# Circular reference detected, assume mixed to force full processing
return "mixed"
@@ -2111,8 +2114,9 @@

try:
# OPTIMIZATION: Check cache first for collections we've seen before
if obj_id in _COLLECTION_COMPATIBILITY_CACHE:
return _COLLECTION_COMPATIBILITY_CACHE[obj_id]
cache_key = (obj_id, obj_len)
if cache_key in _COLLECTION_COMPATIBILITY_CACHE:
return _COLLECTION_COMPATIBILITY_CACHE[cache_key]

homogeneity_result = None

@@ -2128,7 +2132,9 @@
homogeneity_result = "json_basic"
elif all(_is_json_basic_type_safe(v, _seen_ids, _max_check_depth - 1) for v in sample):
# Check if all values are JSON-basic types
homogeneity_result = "json_basic"
# For single-item collections, return 'single_type' to avoid overconfident
# classification as 'json_basic' with insufficient sample size.
homogeneity_result = "single_type" if len(sample) == 1 else "json_basic"
else:
# Check if all values are the same type
first_type = type(sample[0])
@@ -2143,7 +2149,8 @@

if all(_is_json_basic_type_safe(item, _seen_ids, _max_check_depth - 1) for item in sample_items):
# Check if all items are JSON-basic types
homogeneity_result = "json_basic"
# For single-item collections, prefer 'single_type' to avoid overclassification
homogeneity_result = "single_type" if len(sample_items) == 1 else "json_basic"
else:
# Check if all items are the same type
first_type = type(sample_items[0])
@@ -2153,7 +2160,7 @@

# Cache the result if we have space
if homogeneity_result is not None and len(_COLLECTION_COMPATIBILITY_CACHE) < _COLLECTION_CACHE_SIZE_LIMIT:
_COLLECTION_COMPATIBILITY_CACHE[obj_id] = homogeneity_result
_COLLECTION_COMPATIBILITY_CACHE[cache_key] = homogeneity_result

return homogeneity_result

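To make the cache-key change above concrete, here is a minimal standalone sketch (illustrative only, not datason internals): a collection that is mutated in place keeps its `id()`, so pairing the identity with the current length is what invalidates the stale entry.

```python
# Illustrative sketch: why (id, len) is used as the cache key instead of id alone.
items: list = []
stale_key = id(items)                  # identity-only key: unchanged by mutation
items.append(42)                       # empty -> single item, same object identity
fresh_key = (id(items), len(items))    # length component changes, so a cached
                                       # "empty collection" result is not reused
assert stale_key == id(items)          # id alone would still hit the old entry
```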
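The single-item guard can likewise be illustrated with a simplified stand-in for the real helper (the function name and type checks here are assumptions for illustration, not the actual datason code):

```python
# Illustrative sketch: a one-element sample is too small to label the whole
# collection "json_basic" with confidence, so the conservative "single_type"
# label is returned instead, forcing full per-item processing.
def classify(sample: list) -> str:
    json_basic = (str, int, float, bool, type(None))
    if all(isinstance(v, json_basic) for v in sample):
        return "single_type" if len(sample) == 1 else "json_basic"
    return "mixed"

assert classify([3.14]) == "single_type"
assert classify([1, 2, 3]) == "json_basic"
```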
13 changes: 13 additions & 0 deletions docs/supported-types.md
@@ -0,0 +1,13 @@
# Supported Types

This file is auto-generated by `tools/gen_supported_types.py`.

| Type | Round-trip? | Notes/Caveats | Test ID |
| --- | --- | --- | --- |
| catboost.model | ✅ | | T001 |
| keras.model | ✅ | | T002 |
| optuna.Study | ✅ | | T003 |
| plotly.graph_objects.Figure | ✅ | | T004 |
| polars.DataFrame | ✅ | | T005 |
| sklearn.base.BaseEstimator | ✅ | | T006 |
| torch.Tensor | ✅ | | T007 |
5 changes: 5 additions & 0 deletions mkdocs.yml
@@ -80,6 +80,9 @@ plugins:
merge_init_into_class: true
show_submodules: true

hooks:
- tools/mkdocs_hooks.py

# Extensions
markdown_extensions:
- abbr
@@ -124,6 +127,7 @@ markdown_extensions:
# Fixed navigation using files that actually exist
nav:
- Home: index.md
- Supported Types: supported-types.md

# User Guide Section
- User Guide:
@@ -195,6 +199,7 @@ nav:
# Reference Documentation
- Reference:
- Feature Matrix: FEATURE_MATRIX.md
- Supported Types: supported-types.md
- AI Usage Guide: AI_USAGE_GUIDE.md

# Community & Development Section
23 changes: 23 additions & 0 deletions tests/unit/test_supported_types_matrix.py
@@ -0,0 +1,23 @@
"""Tests for supported types matrix generation."""

import os
import sys
from pathlib import Path

import pytest

sys.path.insert(0, str(Path(__file__).resolve().parents[2]))

from tools.gen_supported_types import generate_supported_types_table


@pytest.mark.skipif(
not os.environ.get("RUN_SUPPORTED_TYPES"),
reason="Run only in full/ML CI environments to avoid noise",
)
def test_generate_supported_types_table(tmp_path):
"""Table generation runs without regressions and creates file."""
doc_path = Path("docs/supported-types.md")
# Ensure function runs and writes the file; will raise on regression
generate_supported_types_table(doc_path=doc_path)
assert doc_path.exists()
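For a local run outside CI, the same opt-in flag can be exported before invoking pytest, e.g. `RUN_SUPPORTED_TYPES=1 pytest tests/unit/test_supported_types_matrix.py -v`; this assumes the optional ML dependencies from the full environment are installed, otherwise most rows in the generated table fall back to the dependency-missing status.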
1 change: 1 addition & 0 deletions tools/__init__.py
@@ -0,0 +1 @@
"""Utility scripts for datason repository."""
164 changes: 164 additions & 0 deletions tools/gen_supported_types.py
@@ -0,0 +1,164 @@
#!/usr/bin/env python3
"""Generate supported types matrix by testing registered handlers."""

from __future__ import annotations

import sys
from pathlib import Path
from typing import Callable

# Ensure repository root is on the Python path for local execution
REPO_ROOT = Path(__file__).resolve().parent.parent
if str(REPO_ROOT) not in sys.path:
sys.path.insert(0, str(REPO_ROOT))

import datason # noqa: F401,E402 - ensure handlers are registered
from datason.type_registry import get_type_registry # noqa: E402

# Map registry type names to factory functions that create sample objects.
# Each factory may raise ImportError if the dependency is missing.


def _catboost_sample():
import catboost # type: ignore

return catboost.CatBoostClassifier(iterations=1, depth=1, verbose=False)


def _keras_sample():
import keras # type: ignore

return keras.Sequential()


def _optuna_sample():
import optuna # type: ignore

return optuna.create_study(direction="minimize")


def _plotly_sample():
import plotly.graph_objects as go # type: ignore

return go.Figure()


def _polars_sample():
import polars as pl # type: ignore

return pl.DataFrame({"a": [1, 2]})


def _torch_sample():
import torch # type: ignore

return torch.tensor([[1.0, 2.0], [3.0, 4.0]], requires_grad=True)


def _sklearn_sample():
from sklearn.linear_model import LinearRegression # type: ignore

return LinearRegression()


TYPE_SAMPLES: dict[str, Callable[[], object]] = {
"catboost.model": _catboost_sample,
"keras.model": _keras_sample,
"optuna.Study": _optuna_sample,
"plotly.graph_objects.Figure": _plotly_sample,
"polars.DataFrame": _polars_sample,
"torch.Tensor": _torch_sample,
"sklearn.base.BaseEstimator": _sklearn_sample,
}


def _parse_previous(path: Path) -> dict[str, bool]:
"""Parse previous table to detect regressions."""
if not path.exists():
return {}
previous: dict[str, bool] = {}
for line in path.read_text(encoding="utf-8").splitlines():
if not line.startswith("|") or line.startswith("| Type"):
continue
parts = [p.strip() for p in line.strip().strip("|").split("|")]
if len(parts) < 4:
continue
status = parts[1]
previous[parts[0]] = status in {"✅", "Yes", "yes", "true", "True"}
return previous


def generate_supported_types_table(doc_path: Path | None = None, fail_on_regression: bool = True) -> list[str]:
"""Generate the supported types table.

Args:
doc_path: Optional path to output markdown file.
fail_on_regression: If True, raise error when a previously passing type now fails.

Returns:
List of regression type names (empty if none).
"""
repo_root = Path(__file__).resolve().parent.parent
if doc_path is None:
doc_path = repo_root / "docs" / "supported-types.md"

registry = get_type_registry()
type_names = sorted(registry.get_registered_types())
previous = _parse_previous(doc_path)

lines: list[str] = []
regressions: list[str] = []

header = (
"# Supported Types\n\n"
"This file is auto-generated by `tools/gen_supported_types.py`.\n\n"
"| Type | Round-trip? | Notes/Caveats | Test ID |\n"
"| --- | --- | --- | --- |\n"
)

for idx, type_name in enumerate(type_names, 1):
handler = registry.find_handler_by_type_name(type_name)
factory = TYPE_SAMPLES.get(type_name)
note = ""
success: bool | None
if factory is None:
success = None
note = "No sample available"
else:
try:
obj = factory()
except Exception as exc: # ImportError or others
success = None
note = f"Dependency not installed: {exc.__class__.__name__}"
else:
try:
serialized = handler.serialize(obj) if handler else None
deserialized = handler.deserialize(serialized) if handler else None
success = handler is not None and isinstance(deserialized, obj.__class__)
if not success:
note = f"Deserialized as {type(deserialized).__name__ if deserialized is not None else 'None'}"
except Exception as exc: # pragma: no cover - unexpected
success = False
note = str(exc)
status = "✅" if success else ("❌" if success is False else "⚠️")
test_id = f"T{idx:03d}"
lines.append(f"| {type_name} | {status} | {note} | {test_id} |")
# Only treat as regression on explicit failure. Missing deps (None)
# are not counted as regressions to avoid CI false positives.
if previous.get(type_name) and success is False:
regressions.append(type_name)

doc_path.write_text(header + "\n".join(lines) + "\n", encoding="utf-8")

if fail_on_regression and regressions:
raise RuntimeError("Round-trip regression for: " + ", ".join(regressions))
return regressions


def main() -> None:
"""CLI entrypoint."""
generate_supported_types_table()


if __name__ == "__main__": # pragma: no cover
main()
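As a usage sketch for regenerating the table locally (assuming execution from the repository root so the relative `docs/` path resolves):

```python
# Regenerate docs/supported-types.md; the function returns the list of regressed
# type names and raises if any are found, because fail_on_regression defaults to True.
from pathlib import Path

from tools.gen_supported_types import generate_supported_types_table

regressions = generate_supported_types_table(doc_path=Path("docs/supported-types.md"))
assert regressions == []  # empty when no previously-passing type now fails
```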
16 changes: 16 additions & 0 deletions tools/mkdocs_hooks.py
@@ -0,0 +1,16 @@
"""MkDocs hooks for datason documentation build."""

import sys
from pathlib import Path

# Ensure repository root is on path when MkDocs loads this file
REPO_ROOT = Path(__file__).resolve().parent.parent
if str(REPO_ROOT) not in sys.path:
sys.path.insert(0, str(REPO_ROOT))

from tools.gen_supported_types import generate_supported_types_table # noqa: E402


def on_pre_build(config):
"""Generate supported types table before building docs."""
generate_supported_types_table()
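With the `hooks:` entry added to `mkdocs.yml` above, MkDocs calls `on_pre_build` before every build, so `mkdocs build` or `mkdocs serve` regenerates `docs/supported-types.md` automatically; note that file-based hooks require MkDocs 1.4 or newer.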