diff --git a/.github/codeql/codeql-config.yml b/.github/codeql/codeql-config.yml
new file mode 100644
index 0000000..761e383
--- /dev/null
+++ b/.github/codeql/codeql-config.yml
@@ -0,0 +1,12 @@
+name: "CodeQL configuration for datason"
+
+# Restrict analysis to Python until Rust code is added.
+languages:
+  - python
+
+# Optionally tune paths; keep default behavior otherwise.
+paths-ignore:
+  - docs/
+  - examples/
+  - tests/
+  - scripts/
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 4d5bc2a..8c9eefd 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -291,6 +291,11 @@ jobs:
           export PYTHONDONTWRITEBYTECODE=1
           export PYTHONOVERSAFE=1
 
+          # Only run the Supported Types Matrix test in ML/full environments
+          if [ "${{ matrix.dependency-set.use-ml }}" = "true" ] || [ "${{ matrix.dependency-set.use-dev }}" = "true" ]; then
+            export RUN_SUPPORTED_TYPES=1
+          fi
+
           pytest ${{ matrix.dependency-set.test-pattern }} -v \
             --cov=datason \
             --cov-report=xml:coverage-${{ matrix.dependency-set.name }}.xml \
diff --git a/datason/core_new.py b/datason/core_new.py
index e370ad5..d986ecc 100644
--- a/datason/core_new.py
+++ b/datason/core_new.py
@@ -10,7 +10,7 @@
 from datetime import datetime
 from decimal import Decimal
 from pathlib import Path
-from typing import Any, Callable, Dict, Generator, Iterator, List, Optional, Set, Union
+from typing import Any, Callable, Dict, Generator, Iterator, List, Optional, Set, Tuple, Union
 
 try:
     import pandas as pd
@@ -74,7 +74,9 @@ def normalize_numpy_types(obj: Any) -> Any:  # Match exact signature from type_h
 _UUID_CACHE_SIZE_LIMIT = 100  # Small cache for common UUIDs
 
 # OPTIMIZATION: Collection processing cache for bulk operations
-_COLLECTION_COMPATIBILITY_CACHE: Dict[int, str] = {}  # Maps collection id to compatibility status
+# Cache homogeneity results per collection identity and size to avoid stale results
+# when a collection is mutated (e.g., empty -> single item) but retains its identity.
+_COLLECTION_COMPATIBILITY_CACHE: Dict[Tuple[int, int], str] = {}
 _COLLECTION_CACHE_SIZE_LIMIT = 200  # Smaller cache for collections
 
 # OPTIMIZATION: Memory allocation optimization - Phase 1 Step 1.4
@@ -2102,6 +2104,7 @@ def _is_homogeneous_collection(
     # SECURITY: Check for circular references
     obj_id = id(obj)
+    obj_len = len(obj) if isinstance(obj, (list, tuple, dict)) else 0
     if obj_id in _seen_ids:
         # Circular reference detected, assume mixed to force full processing
         return "mixed"
@@ -2111,8 +2114,9 @@
     try:
         # OPTIMIZATION: Check cache first for collections we've seen before
-        if obj_id in _COLLECTION_COMPATIBILITY_CACHE:
-            return _COLLECTION_COMPATIBILITY_CACHE[obj_id]
+        cache_key = (obj_id, obj_len)
+        if cache_key in _COLLECTION_COMPATIBILITY_CACHE:
+            return _COLLECTION_COMPATIBILITY_CACHE[cache_key]
 
         homogeneity_result = None
@@ -2128,7 +2132,9 @@
                 homogeneity_result = "json_basic"
             elif all(_is_json_basic_type_safe(v, _seen_ids, _max_check_depth - 1) for v in sample):
                 # Check if all values are JSON-basic types
-                homogeneity_result = "json_basic"
+                # For single-item collections, return 'single_type' to avoid an overconfident
+                # 'json_basic' classification from an insufficient sample size.
+                homogeneity_result = "single_type" if len(sample) == 1 else "json_basic"
             else:
                 # Check if all values are the same type
                 first_type = type(sample[0])
@@ -2143,7 +2149,8 @@
             if all(_is_json_basic_type_safe(item, _seen_ids, _max_check_depth - 1) for item in sample_items):
                 # Check if all items are JSON-basic types
-                homogeneity_result = "json_basic"
+                # For single-item collections, prefer 'single_type' to avoid overclassification
+                homogeneity_result = "single_type" if len(sample_items) == 1 else "json_basic"
             else:
                 # Check if all items are the same type
                 first_type = type(sample_items[0])
@@ -2153,7 +2160,7 @@
         # Cache the result if we have space
         if homogeneity_result is not None and len(_COLLECTION_COMPATIBILITY_CACHE) < _COLLECTION_CACHE_SIZE_LIMIT:
-            _COLLECTION_COMPATIBILITY_CACHE[obj_id] = homogeneity_result
+            _COLLECTION_COMPATIBILITY_CACHE[cache_key] = homogeneity_result
 
         return homogeneity_result
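Why the cache key grew a length component: `id()` is only unique for the lifetime of an object, and a mutated collection keeps its identity, so an identity-only key can serve stale verdicts. A minimal sketch of the staleness this guards against (plain Python, no datason imports; the verdict strings just mirror the ones above):

```python
# Sketch: why (id, len) beats id alone as a cache key for mutable collections.
cache: dict[tuple[int, int], str] = {}

items: list[object] = []
cache[(id(items), len(items))] = "json_basic"  # verdict cached for the empty list

items.append(object())  # same identity, different contents

# An id-only scheme would return the stale "json_basic" verdict here;
# the composite key misses instead and forces re-classification.
assert (id(items), len(items)) not in cache
```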
diff --git a/docs/supported-types.md b/docs/supported-types.md
new file mode 100644
index 0000000..982b645
--- /dev/null
+++ b/docs/supported-types.md
@@ -0,0 +1,13 @@
+# Supported Types
+
+This file is auto-generated by `tools/gen_supported_types.py`.
+
+| Type | Round-trip? | Notes/Caveats | Test ID |
+| --- | --- | --- | --- |
+| catboost.model | ✅ | | T001 |
+| keras.model | ✅ | | T002 |
+| optuna.Study | ✅ | | T003 |
+| plotly.graph_objects.Figure | ✅ | | T004 |
+| polars.DataFrame | ✅ | | T005 |
+| sklearn.base.BaseEstimator | ✅ | | T006 |
+| torch.Tensor | ✅ | | T007 |
diff --git a/mkdocs.yml b/mkdocs.yml
index a3e5a1c..60165aa 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -80,6 +80,9 @@ plugins:
           merge_init_into_class: true
           show_submodules: true
 
+hooks:
+  - tools/mkdocs_hooks.py
+
 # Extensions
 markdown_extensions:
   - abbr
@@ -124,6 +127,7 @@ markdown_extensions:
 # Fixed navigation using files that actually exist
 nav:
   - Home: index.md
+  - Supported Types: supported-types.md
 
   # User Guide Section
   - User Guide:
@@ -195,6 +199,7 @@ nav:
   # Reference Documentation
   - Reference:
       - Feature Matrix: FEATURE_MATRIX.md
+      - Supported Types: supported-types.md
       - AI Usage Guide: AI_USAGE_GUIDE.md
 
   # Community & Development Section
diff --git a/tests/unit/test_supported_types_matrix.py b/tests/unit/test_supported_types_matrix.py
new file mode 100644
index 0000000..9302cd6
--- /dev/null
+++ b/tests/unit/test_supported_types_matrix.py
@@ -0,0 +1,23 @@
+"""Tests for supported types matrix generation."""
+
+import os
+import sys
+from pathlib import Path
+
+import pytest
+
+sys.path.insert(0, str(Path(__file__).resolve().parents[2]))
+
+from tools.gen_supported_types import generate_supported_types_table
+
+
+@pytest.mark.skipif(
+    not os.environ.get("RUN_SUPPORTED_TYPES"),
+    reason="Run only in full/ML CI environments to avoid noise",
+)
+def test_generate_supported_types_table():
+    """Table generation runs without regressions and creates the file."""
+    doc_path = Path("docs/supported-types.md")
+    # Ensure the function runs and writes the file; it raises on regression
+    generate_supported_types_table(doc_path=doc_path)
+    assert doc_path.exists()
diff --git a/tools/__init__.py b/tools/__init__.py
new file mode 100644
index 0000000..f52da88
--- /dev/null
+++ b/tools/__init__.py
@@ -0,0 +1 @@
+"""Utility scripts for datason repository."""
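The generator introduced below reduces each table row to a serialize/deserialize round trip through the registered handler. A sketch of that probe, using the registry API exactly as the new script does (assumes the optional polars dependency is installed; a missing dependency is what produces a ⚠️ row):

```python
import datason  # noqa: F401 - importing registers the type handlers
from datason.type_registry import get_type_registry

import polars as pl  # optional dependency

registry = get_type_registry()
handler = registry.find_handler_by_type_name("polars.DataFrame")
assert handler is not None  # otherwise the row is reported as a failure

# A row is ✅ only when deserialization restores the original type.
obj = pl.DataFrame({"a": [1, 2]})
restored = handler.deserialize(handler.serialize(obj))
assert isinstance(restored, obj.__class__)
```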
diff --git a/tools/gen_supported_types.py b/tools/gen_supported_types.py
new file mode 100644
index 0000000..bd93b1c
--- /dev/null
+++ b/tools/gen_supported_types.py
@@ -0,0 +1,164 @@
+#!/usr/bin/env python3
+"""Generate the supported types matrix by testing registered handlers."""
+
+from __future__ import annotations
+
+import sys
+from pathlib import Path
+from typing import Callable
+
+# Ensure the repository root is on the Python path for local execution
+REPO_ROOT = Path(__file__).resolve().parent.parent
+if str(REPO_ROOT) not in sys.path:
+    sys.path.insert(0, str(REPO_ROOT))
+
+import datason  # noqa: F401,E402 - ensure handlers are registered
+from datason.type_registry import get_type_registry  # noqa: E402
+
+# Map registry type names to factory functions that create sample objects.
+# Each factory may raise ImportError if the dependency is missing.
+
+
+def _catboost_sample():
+    import catboost  # type: ignore
+
+    return catboost.CatBoostClassifier(iterations=1, depth=1, verbose=False)
+
+
+def _keras_sample():
+    import keras  # type: ignore
+
+    return keras.Sequential()
+
+
+def _optuna_sample():
+    import optuna  # type: ignore
+
+    return optuna.create_study(direction="minimize")
+
+
+def _plotly_sample():
+    import plotly.graph_objects as go  # type: ignore
+
+    return go.Figure()
+
+
+def _polars_sample():
+    import polars as pl  # type: ignore
+
+    return pl.DataFrame({"a": [1, 2]})
+
+
+def _torch_sample():
+    import torch  # type: ignore
+
+    return torch.tensor([[1.0, 2.0], [3.0, 4.0]], requires_grad=True)
+
+
+def _sklearn_sample():
+    from sklearn.linear_model import LinearRegression  # type: ignore
+
+    return LinearRegression()
+
+
+TYPE_SAMPLES: dict[str, Callable[[], object]] = {
+    "catboost.model": _catboost_sample,
+    "keras.model": _keras_sample,
+    "optuna.Study": _optuna_sample,
+    "plotly.graph_objects.Figure": _plotly_sample,
+    "polars.DataFrame": _polars_sample,
+    "torch.Tensor": _torch_sample,
+    "sklearn.base.BaseEstimator": _sklearn_sample,
+}
+
+
+def _parse_previous(path: Path) -> dict[str, bool]:
+    """Parse the previous table to detect regressions."""
+    if not path.exists():
+        return {}
+    previous: dict[str, bool] = {}
+    for line in path.read_text(encoding="utf-8").splitlines():
+        if not line.startswith("|") or line.startswith("| Type"):
+            continue
+        parts = [p.strip() for p in line.strip().strip("|").split("|")]
+        if len(parts) < 4 or parts[0] == "---":
+            # Skip malformed rows and the markdown separator row
+            continue
+        status = parts[1]
+        previous[parts[0]] = status in {"✅", "Yes", "yes", "true", "True"}
+    return previous
+
+
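+# Verdict encoding used below: True -> "✅" (round-trips), False -> "❌" (fails;
+# counts as a regression when the type previously passed), None -> "⚠️" (sample
+# factory or optional dependency unavailable; never treated as a regression).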
+def generate_supported_types_table(doc_path: Path | None = None, fail_on_regression: bool = True) -> list[str]:
+    """Generate the supported types table.
+
+    Args:
+        doc_path: Optional path to the output markdown file.
+        fail_on_regression: If True, raise an error when a previously passing type now fails.
+
+    Returns:
+        List of regressed type names (empty if none).
+    """
+    repo_root = Path(__file__).resolve().parent.parent
+    if doc_path is None:
+        doc_path = repo_root / "docs" / "supported-types.md"
+
+    registry = get_type_registry()
+    type_names = sorted(registry.get_registered_types())
+    previous = _parse_previous(doc_path)
+
+    lines: list[str] = []
+    regressions: list[str] = []
+
+    header = (
+        "# Supported Types\n\n"
+        "This file is auto-generated by `tools/gen_supported_types.py`.\n\n"
+        "| Type | Round-trip? | Notes/Caveats | Test ID |\n"
+        "| --- | --- | --- | --- |\n"
+    )
+
+    for idx, type_name in enumerate(type_names, 1):
+        handler = registry.find_handler_by_type_name(type_name)
+        factory = TYPE_SAMPLES.get(type_name)
+        note = ""
+        success: bool | None
+        if factory is None:
+            success = None
+            note = "No sample available"
+        elif handler is None:
+            success = False
+            note = "No handler registered"
+        else:
+            try:
+                obj = factory()
+            except Exception as exc:  # ImportError or similar dependency issues
+                success = None
+                note = f"Dependency not installed: {exc.__class__.__name__}"
+            else:
+                try:
+                    serialized = handler.serialize(obj)
+                    deserialized = handler.deserialize(serialized)
+                    success = isinstance(deserialized, obj.__class__)
+                    if not success:
+                        note = f"Deserialized as {type(deserialized).__name__}"
+                except Exception as exc:  # pragma: no cover - unexpected
+                    success = False
+                    note = str(exc)
+        status = "✅" if success else ("❌" if success is False else "⚠️")
+        test_id = f"T{idx:03d}"
+        lines.append(f"| {type_name} | {status} | {note} | {test_id} |")
+        # Only treat an explicit failure as a regression. Missing deps (None)
+        # are not counted, to avoid CI false positives.
+        if previous.get(type_name) and success is False:
+            regressions.append(type_name)
+
+    doc_path.write_text(header + "\n".join(lines) + "\n", encoding="utf-8")
+
+    if fail_on_regression and regressions:
+        raise RuntimeError("Round-trip regression for: " + ", ".join(regressions))
+    return regressions
+
+
+def main() -> None:
+    """CLI entrypoint."""
+    generate_supported_types_table()
+
+
+if __name__ == "__main__":  # pragma: no cover
+    main()
diff --git a/tools/mkdocs_hooks.py b/tools/mkdocs_hooks.py
new file mode 100644
index 0000000..05646d4
--- /dev/null
+++ b/tools/mkdocs_hooks.py
@@ -0,0 +1,16 @@
+"""MkDocs hooks for datason documentation build."""
+
+import sys
+from pathlib import Path
+
+# Ensure the repository root is on the path when MkDocs loads this file
+REPO_ROOT = Path(__file__).resolve().parent.parent
+if str(REPO_ROOT) not in sys.path:
+    sys.path.insert(0, str(REPO_ROOT))
+
+from tools.gen_supported_types import generate_supported_types_table  # noqa: E402
+
+
+def on_pre_build(config):
+    """Generate the supported types table before building docs."""
+    generate_supported_types_table()
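For reference, a minimal sketch of driving the generator from a local checkout (assumes the repo root as the working directory; this is the same call path the MkDocs hook and the `RUN_SUPPORTED_TYPES`-gated test use):

```python
from pathlib import Path

from tools.gen_supported_types import generate_supported_types_table

try:
    # Rewrites docs/supported-types.md and returns the list of regressed types.
    regressions = generate_supported_types_table(doc_path=Path("docs/supported-types.md"))
except RuntimeError as exc:
    # Raised when fail_on_regression=True and a previously ✅ type now fails.
    print(f"Regression: {exc}")
else:
    print(f"Table written; regressions: {regressions}")  # [] when all clear
```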