12 changes: 12 additions & 0 deletions .github/codeql/codeql-config.yml
@@ -0,0 +1,12 @@
name: "CodeQL configuration for datason"

# Restrict analysis to Python until Rust code is added.
languages:
- python

# Optionally tune paths; keep default behavior otherwise.
paths-ignore:
- docs/
- examples/
- tests/
- scripts/
5 changes: 5 additions & 0 deletions .github/workflows/ci.yml
@@ -291,6 +291,11 @@ jobs:
export PYTHONDONTWRITEBYTECODE=1
export PYTHONOVERSAFE=1

# Only run Supported Types Matrix test in ML/full environments
if [ "${{ matrix.dependency-set.use-ml }}" = "true" ] || [ "${{ matrix.dependency-set.use-dev }}" = "true" ]; then
export RUN_SUPPORTED_TYPES=1
fi

pytest ${{ matrix.dependency-set.test-pattern }} -v \
--cov=datason \
--cov-report=xml:coverage-${{ matrix.dependency-set.name }}.xml \
21 changes: 14 additions & 7 deletions datason/core_new.py
@@ -10,7 +10,7 @@
from datetime import datetime
from decimal import Decimal
from pathlib import Path
from typing import Any, Callable, Dict, Generator, Iterator, List, Optional, Set, Union
from typing import Any, Callable, Dict, Generator, Iterator, List, Optional, Set, Tuple, Union

try:
import pandas as pd
@@ -74,7 +74,9 @@ def normalize_numpy_types(obj: Any) -> Any: # Match exact signature from type_h
_UUID_CACHE_SIZE_LIMIT = 100 # Small cache for common UUIDs

# OPTIMIZATION: Collection processing cache for bulk operations
_COLLECTION_COMPATIBILITY_CACHE: Dict[int, str] = {} # Maps collection id to compatibility status
# Cache homogeneity results per collection identity and size to avoid stale results
# when a collection is mutated (e.g., empty -> single item) but retains identity.
_COLLECTION_COMPATIBILITY_CACHE: Dict[Tuple[int, int], str] = {}
_COLLECTION_CACHE_SIZE_LIMIT = 200 # Smaller cache for collections

# OPTIMIZATION: Memory allocation optimization - Phase 1 Step 1.4
@@ -2102,6 +2104,7 @@ def _is_homogeneous_collection(

# SECURITY: Check for circular references
obj_id = id(obj)
obj_len = len(obj) if isinstance(obj, (list, tuple, dict)) else 0
if obj_id in _seen_ids:
# Circular reference detected, assume mixed to force full processing
return "mixed"
@@ -2111,8 +2114,9 @@

try:
# OPTIMIZATION: Check cache first for collections we've seen before
if obj_id in _COLLECTION_COMPATIBILITY_CACHE:
return _COLLECTION_COMPATIBILITY_CACHE[obj_id]
cache_key = (obj_id, obj_len)
if cache_key in _COLLECTION_COMPATIBILITY_CACHE:
return _COLLECTION_COMPATIBILITY_CACHE[cache_key]

homogeneity_result = None

@@ -2128,7 +2132,9 @@
homogeneity_result = "json_basic"
elif all(_is_json_basic_type_safe(v, _seen_ids, _max_check_depth - 1) for v in sample):
# Check if all values are JSON-basic types
homogeneity_result = "json_basic"
# For single-item collections, return 'single_type' to avoid overconfident
# classification as 'json_basic' with insufficient sample size.
homogeneity_result = "single_type" if len(sample) == 1 else "json_basic"
else:
# Check if all values are the same type
first_type = type(sample[0])
@@ -2143,7 +2149,8 @@

if all(_is_json_basic_type_safe(item, _seen_ids, _max_check_depth - 1) for item in sample_items):
# Check if all items are JSON-basic types
homogeneity_result = "json_basic"
# For single-item collections, prefer 'single_type' to avoid overclassification
homogeneity_result = "single_type" if len(sample_items) == 1 else "json_basic"
else:
# Check if all items are the same type
first_type = type(sample_items[0])
@@ -2153,7 +2160,7 @@

# Cache the result if we have space
if homogeneity_result is not None and len(_COLLECTION_COMPATIBILITY_CACHE) < _COLLECTION_CACHE_SIZE_LIMIT:
_COLLECTION_COMPATIBILITY_CACHE[obj_id] = homogeneity_result
_COLLECTION_COMPATIBILITY_CACHE[cache_key] = homogeneity_result

return homogeneity_result

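To make the cache-key change above concrete, here is a minimal standalone sketch (illustrative only, not datason internals): a collection that is mutated in place keeps its `id()`, so pairing the identity with the current length is what invalidates the stale entry.

```python
# Illustrative sketch: why (id, len) is used as the cache key instead of id alone.
items: list = []
stale_key = id(items)                  # identity-only key: unchanged by mutation
items.append(42)                       # empty -> single item, same object identity
fresh_key = (id(items), len(items))    # length component changes, so a cached
                                       # "empty collection" result is not reused
assert stale_key == id(items)          # id alone would still hit the old entry
```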
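The single-item guard can likewise be illustrated with a simplified stand-in for the real helper (the function name and type checks here are assumptions for illustration, not the actual datason code):

```python
# Illustrative sketch: a one-element sample is too small to label the whole
# collection "json_basic" with confidence, so the conservative "single_type"
# label is returned instead, forcing full per-item processing.
def classify(sample: list) -> str:
    json_basic = (str, int, float, bool, type(None))
    if all(isinstance(v, json_basic) for v in sample):
        return "single_type" if len(sample) == 1 else "json_basic"
    return "mixed"

assert classify([3.14]) == "single_type"
assert classify([1, 2, 3]) == "json_basic"
```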
13 changes: 13 additions & 0 deletions docs/supported-types.md
@@ -0,0 +1,13 @@
# Supported Types

This file is auto-generated by `tools/gen_supported_types.py`.

| Type | Round-trip? | Notes/Caveats | Test ID |
| --- | --- | --- | --- |
| catboost.model | ✅ | | T001 |
| keras.model | ✅ | | T002 |
| optuna.Study | ✅ | | T003 |
| plotly.graph_objects.Figure | ✅ | | T004 |
| polars.DataFrame | ✅ | | T005 |
| sklearn.base.BaseEstimator | ✅ | | T006 |
| torch.Tensor | ✅ | | T007 |
5 changes: 5 additions & 0 deletions mkdocs.yml
@@ -80,6 +80,9 @@ plugins:
merge_init_into_class: true
show_submodules: true

hooks:
- tools/mkdocs_hooks.py

# Extensions
markdown_extensions:
- abbr
@@ -124,6 +127,7 @@ markdown_extensions:
# Fixed navigation using files that actually exist
nav:
- Home: index.md
- Supported Types: supported-types.md

# User Guide Section
- User Guide:
@@ -195,6 +199,7 @@ nav:
# Reference Documentation
- Reference:
- Feature Matrix: FEATURE_MATRIX.md
- Supported Types: supported-types.md
- AI Usage Guide: AI_USAGE_GUIDE.md

# Community & Development Section
23 changes: 23 additions & 0 deletions tests/unit/test_supported_types_matrix.py
@@ -0,0 +1,23 @@
"""Tests for supported types matrix generation."""

import os
import sys
from pathlib import Path

import pytest

sys.path.insert(0, str(Path(__file__).resolve().parents[2]))

from tools.gen_supported_types import generate_supported_types_table


@pytest.mark.skipif(
not os.environ.get("RUN_SUPPORTED_TYPES"),
reason="Run only in full/ML CI environments to avoid noise",
)
def test_generate_supported_types_table(tmp_path):
"""Table generation runs without regressions and creates file."""
doc_path = Path("docs/supported-types.md")
# Ensure function runs and writes the file; will raise on regression
generate_supported_types_table(doc_path=doc_path)
assert doc_path.exists()
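For a local run outside CI, the same opt-in flag can be exported before invoking pytest, e.g. `RUN_SUPPORTED_TYPES=1 pytest tests/unit/test_supported_types_matrix.py -v`; this assumes the optional ML dependencies from the full environment are installed, otherwise most rows in the generated table fall back to the dependency-missing status.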
1 change: 1 addition & 0 deletions tools/__init__.py
@@ -0,0 +1 @@
"""Utility scripts for datason repository."""
164 changes: 164 additions & 0 deletions tools/gen_supported_types.py
@@ -0,0 +1,164 @@
#!/usr/bin/env python3
"""Generate supported types matrix by testing registered handlers."""

from __future__ import annotations

import sys
from pathlib import Path
from typing import Callable

# Ensure repository root is on the Python path for local execution
REPO_ROOT = Path(__file__).resolve().parent.parent
if str(REPO_ROOT) not in sys.path:
sys.path.insert(0, str(REPO_ROOT))

import datason # noqa: F401,E402 - ensure handlers are registered
from datason.type_registry import get_type_registry # noqa: E402

# Map registry type names to factory functions that create sample objects.
# Each factory may raise ImportError if the dependency is missing.


def _catboost_sample():
import catboost # type: ignore

return catboost.CatBoostClassifier(iterations=1, depth=1, verbose=False)


def _keras_sample():
import keras # type: ignore

return keras.Sequential()


def _optuna_sample():
import optuna # type: ignore

return optuna.create_study(direction="minimize")


def _plotly_sample():
import plotly.graph_objects as go # type: ignore

return go.Figure()


def _polars_sample():
import polars as pl # type: ignore

return pl.DataFrame({"a": [1, 2]})


def _torch_sample():
import torch # type: ignore

return torch.tensor([[1.0, 2.0], [3.0, 4.0]], requires_grad=True)


def _sklearn_sample():
from sklearn.linear_model import LinearRegression # type: ignore

return LinearRegression()


TYPE_SAMPLES: dict[str, Callable[[], object]] = {
"catboost.model": _catboost_sample,
"keras.model": _keras_sample,
"optuna.Study": _optuna_sample,
"plotly.graph_objects.Figure": _plotly_sample,
"polars.DataFrame": _polars_sample,
"torch.Tensor": _torch_sample,
"sklearn.base.BaseEstimator": _sklearn_sample,
}


def _parse_previous(path: Path) -> dict[str, bool]:
"""Parse previous table to detect regressions."""
if not path.exists():
return {}
previous: dict[str, bool] = {}
for line in path.read_text(encoding="utf-8").splitlines():
if not line.startswith("|") or line.startswith("| Type"):
continue
parts = [p.strip() for p in line.strip().strip("|").split("|")]
if len(parts) < 4:
continue
status = parts[1]
previous[parts[0]] = status in {"✅", "Yes", "yes", "true", "True"}
return previous


def generate_supported_types_table(doc_path: Path | None = None, fail_on_regression: bool = True) -> list[str]:
"""Generate the supported types table.

Args:
doc_path: Optional path to output markdown file.
fail_on_regression: If True, raise error when a previously passing type now fails.

Returns:
List of regression type names (empty if none).
"""
repo_root = Path(__file__).resolve().parent.parent
if doc_path is None:
doc_path = repo_root / "docs" / "supported-types.md"

registry = get_type_registry()
type_names = sorted(registry.get_registered_types())
previous = _parse_previous(doc_path)

lines: list[str] = []
regressions: list[str] = []

header = (
"# Supported Types\n\n"
"This file is auto-generated by `tools/gen_supported_types.py`.\n\n"
"| Type | Round-trip? | Notes/Caveats | Test ID |\n"
"| --- | --- | --- | --- |\n"
)

for idx, type_name in enumerate(type_names, 1):
handler = registry.find_handler_by_type_name(type_name)
factory = TYPE_SAMPLES.get(type_name)
note = ""
success: bool | None
if factory is None:
success = None
note = "No sample available"
else:
try:
obj = factory()
except Exception as exc: # ImportError or others
success = None
note = f"Dependency not installed: {exc.__class__.__name__}"
else:
try:
serialized = handler.serialize(obj) if handler else None
deserialized = handler.deserialize(serialized) if handler else None
success = handler is not None and isinstance(deserialized, obj.__class__)
if not success:
note = f"Deserialized as {type(deserialized).__name__ if deserialized is not None else 'None'}"
except Exception as exc: # pragma: no cover - unexpected
success = False
note = str(exc)
status = "✅" if success else ("❌" if success is False else "⚠️")
test_id = f"T{idx:03d}"
lines.append(f"| {type_name} | {status} | {note} | {test_id} |")
# Only treat as regression on explicit failure. Missing deps (None)
# are not counted as regressions to avoid CI false positives.
if previous.get(type_name) and success is False:
regressions.append(type_name)

doc_path.write_text(header + "\n".join(lines) + "\n", encoding="utf-8")

if fail_on_regression and regressions:
raise RuntimeError("Round-trip regression for: " + ", ".join(regressions))
return regressions


def main() -> None:
"""CLI entrypoint."""
generate_supported_types_table()


if __name__ == "__main__": # pragma: no cover
main()
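As a usage sketch for regenerating the table locally (assuming execution from the repository root so the relative `docs/` path resolves):

```python
# Regenerate docs/supported-types.md; the function returns the list of regressed
# type names and raises if any are found, because fail_on_regression defaults to True.
from pathlib import Path

from tools.gen_supported_types import generate_supported_types_table

regressions = generate_supported_types_table(doc_path=Path("docs/supported-types.md"))
assert regressions == []  # empty when no previously-passing type now fails
```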
16 changes: 16 additions & 0 deletions tools/mkdocs_hooks.py
@@ -0,0 +1,16 @@
"""MkDocs hooks for datason documentation build."""

import sys
from pathlib import Path

# Ensure repository root is on path when MkDocs loads this file
REPO_ROOT = Path(__file__).resolve().parent.parent
if str(REPO_ROOT) not in sys.path:
sys.path.insert(0, str(REPO_ROOT))

from tools.gen_supported_types import generate_supported_types_table # noqa: E402


def on_pre_build(config):
"""Generate supported types table before building docs."""
generate_supported_types_table()
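With the `hooks:` entry added to `mkdocs.yml` above, MkDocs calls `on_pre_build` before every build, so `mkdocs build` or `mkdocs serve` regenerates `docs/supported-types.md` automatically; note that file-based hooks require MkDocs 1.4 or newer.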