Skip to content

Commit 8e6763a

Browse files
authored
[18362773160] Introduce EXPERIMENTAL_POLARS output format (#2769)
#### Reference Issues/PRs Monday ref: 18362773160 #### What does this implement or fix? Polars output format is just a thin wrapper around the arrow output format. We create the polars dataframe zero-copy from the pyarrow table. Also improves some docs. Adds just a few extra tests for polars because: - Extensive arrow testing covers most arrow related logic - Parametrizing many tests to work with polars is difficult because `polars.DataFrame` does not have any concept of pandas metadata. #### Any other comments? I decided to go through `pyarrow` even though we could avoid the pyarrow dependency by using a [PyCapsule](https://docs.pola.rs/user-guide/misc/arrow/#using-polars-directly), because: - This would require rewriting our arrow denormalization (which currently relies on pyarrow APIs) - This would require extra testing coverage of polars output format. And it is harder to parametrize our existing tests because polars doesn't have a concept of pandas metadata. Also needed to clean up some space for the conda build. This is done in the conda workflow. A successful run with the workflow from this branch can be seen [here](https://github.com/man-group/ArcticDB/actions/runs/19636884795/job/56229690182). #### Checklist <details> <summary> Checklist for code changes... </summary> - [ ] Have you updated the relevant docstrings, documentation and copyright notice? - [ ] Is this contribution tested against [all ArcticDB's features](../docs/mkdocs/docs/technical/contributing.md)? - [ ] Do all exceptions introduced raise appropriate [error messages](https://docs.arcticdb.io/error_messages/)? - [ ] Are API changes highlighted in the PR description? - [ ] Is the PR labelled as enhancement or bug so it appears in autogenerated release notes? </details> <!-- Thanks for contributing a Pull Request to ArcticDB! 
Please ensure you have taken a look at: - ArcticDB's Code of Conduct: https://github.com/man-group/ArcticDB/blob/master/CODE_OF_CONDUCT.md - ArcticDB's Contribution Licensing: https://github.com/man-group/ArcticDB/blob/master/docs/mkdocs/docs/technical/contributing.md#contribution-licensing -->
1 parent 588a901 commit 8e6763a

File tree

10 files changed

+132
-54
lines changed

10 files changed

+132
-54
lines changed

.github/workflows/build_with_conda.yml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,13 @@ jobs:
6565
mongodb:
6666
image: mongo:4.4
6767
steps:
68+
- name: Free Disk Space (Ubuntu)
69+
uses: jlumbroso/[email protected]
70+
with:
71+
tool-cache: false
72+
large-packages: false # Time-consuming but doesn't save that much space (4GB)
73+
docker-images: false # We're using docker images we don't want to clear
74+
6875
- uses: actions/[email protected]
6976
# DONT use recursive submodules checkout to simulate conda feedstock build
7077
# with:

environment-dev.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,7 @@ dependencies:
7979
- pytest-timeout
8080
- azure-storage-blob
8181
- azure-identity
82+
- polars
8283
- pyarrow
8384
- asv
8485
- pymongo

python/arcticdb/dependencies.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -31,9 +31,7 @@ def _import_optional_dependency(module_name: str) -> Tuple[ModuleType, bool]:
3131

3232

3333
pyarrow, _PYARROW_AVAILABLE = _import_optional_dependency("pyarrow")
34+
polars, _POLARS_AVAILABLE = _import_optional_dependency("polars")
3435

3536

36-
__all__ = [
37-
"pyarrow",
38-
"_PYARROW_AVAILABLE",
39-
]
37+
__all__ = ["pyarrow", "_PYARROW_AVAILABLE", "polars", "_POLARS_AVAILABLE"]

python/arcticdb/options.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
from typing import Optional, Union
1010
from enum import Enum
1111

12-
from arcticdb.dependencies import _PYARROW_AVAILABLE
12+
from arcticdb.dependencies import _PYARROW_AVAILABLE, _POLARS_AVAILABLE
1313
from arcticdb.dependencies import pyarrow as pa
1414
from arcticdb.encoding_version import EncodingVersion
1515
from arcticdb_ext.storage import ModifiableLibraryOption, ModifiableEnterpriseLibraryOption
@@ -171,6 +171,7 @@ def __repr__(self):
171171
class OutputFormat(str, Enum):
172172
PANDAS = "PANDAS"
173173
EXPERIMENTAL_ARROW = "EXPERIMENTAL_ARROW"
174+
EXPERIMENTAL_POLARS = "EXPERIMENTAL_POLARS"
174175

175176

176177
def output_format_to_internal(output_format: Union[OutputFormat, str]) -> InternalOutputFormat:
@@ -182,6 +183,12 @@ def output_format_to_internal(output_format: Union[OutputFormat, str]) -> Intern
182183
"ArcticDB's pyarrow optional dependency missing but is required to use arrow output format."
183184
)
184185
return InternalOutputFormat.ARROW
186+
elif output_format.lower() == OutputFormat.EXPERIMENTAL_POLARS.lower():
187+
if not _PYARROW_AVAILABLE or not _POLARS_AVAILABLE:
188+
raise ModuleNotFoundError(
189+
"ArcticDB's pyarrow or polars optional dependencies are missing but are required to use polars output format."
190+
)
191+
return InternalOutputFormat.ARROW
185192
else:
186193
raise ValueError(f"Unknown OutputFormat: {output_format}")
187194

python/arcticdb/util/test.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
import attr
2323
from functools import wraps, reduce
2424
from arcticdb.dependencies import pyarrow as pa
25+
from arcticdb.dependencies import polars as pl
2526

2627
from arcticdb.util.marks import SHORTER_LOGS
2728

@@ -252,6 +253,14 @@ def assert_frame_equal_with_arrow(left, right, **kwargs):
252253
left = convert_arrow_to_pandas_for_tests(left)
253254
if isinstance(right, pa.Table):
254255
right = convert_arrow_to_pandas_for_tests(right)
256+
257+
# Polars does not have a concept of pandas metadata, so this assertion would fail on any pandas dataframe, which
258+
# has indices or any pandas specific column names (e.g. integers, duplicated column names, etc.)
259+
if isinstance(left, pl.DataFrame):
260+
left = left.to_pandas()
261+
if isinstance(right, pl.DataFrame):
262+
right = right.to_pandas()
263+
255264
assert_frame_equal(left, right, **kwargs)
256265

257266

python/arcticdb/version_store/_store.py

Lines changed: 57 additions & 47 deletions
Large diffs are not rendered by default.

python/tests/conftest.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1106,6 +1106,9 @@ def lmdb_version_store_arrow(lmdb_version_store_v1) -> NativeVersionStore:
11061106
return store
11071107

11081108

1109+
# Explicitly not including `OutputFormat.EXPERIMENTAL_POLARS` as `polars.to_pandas()` is not index aware, so all
1110+
# `assert_frame_equal_with_arrow` would not work. Also POLARS is just a thin wrapper on top of pyarrow, so testing
1111+
just one is sufficient.
11091112
@pytest.fixture(
11101113
params=[OutputFormat.PANDAS, pytest.param(OutputFormat.EXPERIMENTAL_ARROW, marks=PYARROW_POST_PROCESSING)]
11111114
)

python/tests/unit/arcticdb/test_arrow_api.py

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import numpy as np
22
import pandas as pd
33
import pyarrow as pa
4+
import polars as pl
45
import pytest
56

67
from arcticdb import LazyDataFrame, DataError, concat
@@ -18,15 +19,29 @@
1819
OutputFormat.EXPERIMENTAL_ARROW,
1920
"EXPERIMENTAL_ARROW",
2021
"experimental_arrow",
22+
OutputFormat.EXPERIMENTAL_POLARS,
23+
"EXPERIMENTAL_POLARS",
24+
"experimental_polars",
25+
]
26+
no_str_output_format_args = [
27+
None,
28+
OutputFormat.PANDAS,
29+
OutputFormat.EXPERIMENTAL_ARROW,
30+
OutputFormat.EXPERIMENTAL_POLARS,
2131
]
22-
no_str_output_format_args = [None, OutputFormat.PANDAS, OutputFormat.EXPERIMENTAL_ARROW]
2332

2433

2534
def expected_output_type(arctic_output_format, library_output_format, output_format_override):
2635
expected_output_format = (
2736
output_format_override or library_output_format or arctic_output_format or OutputFormat.PANDAS
2837
)
29-
return pa.Table if expected_output_format.lower() == OutputFormat.EXPERIMENTAL_ARROW.lower() else pd.DataFrame
38+
if expected_output_format.lower() == OutputFormat.PANDAS.lower():
39+
return pd.DataFrame
40+
if expected_output_format.lower() == OutputFormat.EXPERIMENTAL_ARROW.lower():
41+
return pa.Table
42+
if expected_output_format.lower() == OutputFormat.EXPERIMENTAL_POLARS.lower():
43+
return pl.DataFrame
44+
raise ValueError("Unexpected format")
3045

3146

3247
@pytest.mark.parametrize("arctic_output_format", no_str_output_format_args)

python/tests/unit/arcticdb/version_store/test_arrow_read.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
dataframe_strategy,
2121
column_strategy,
2222
)
23+
import polars as pl
2324
from arcticdb.util.test import get_sample_dataframe, make_dynamic
2425
from arcticdb.util._versions import IS_PANDAS_ONE
2526
from arcticdb_ext.storage import KeyType
@@ -1156,3 +1157,28 @@ def test_arrow_read_batch_with_strings(lmdb_version_store_arrow):
11561157
assert table_2.schema.field(0).type == pa.string() # global default for all symbols
11571158
assert table_2.schema.field(1).type == pa.dictionary(pa.int32(), pa.large_string()) # per_column override
11581159
assert_frame_equal_with_arrow(table_2, df_2)
1160+
1161+
1162+
def test_polars_basic(lmdb_version_store_arrow):
1163+
lib = lmdb_version_store_arrow
1164+
lib.set_output_format(OutputFormat.EXPERIMENTAL_POLARS)
1165+
sym = "polars"
1166+
df = pd.DataFrame(
1167+
{
1168+
"col_int": np.arange(10, dtype=np.int64),
1169+
"col_float": np.arange(10, dtype=np.float32),
1170+
"col_bool": [i % 2 == 0 for i in range(10)],
1171+
"col_str": ["x" * (i + 1) for i in range(10)],
1172+
"col_cat": [f"str_{i%3}" for i in range(10)],
1173+
},
1174+
index=pd.date_range(pd.Timestamp(2025, 1, 1), periods=10),
1175+
)
1176+
lib.write(sym, df)
1177+
result = lib.read(sym, arrow_string_format_per_column={"col_cat": ArrowOutputStringFormat.CATEGORICAL}).data
1178+
df.index.name = "__index__"
1179+
expected = pl.from_pandas(df.reset_index())
1180+
expected = expected.with_columns(pl.col("col_cat").cast(pl.Categorical))
1181+
expected.columns[0] = "__index__"
1182+
assert result.columns == ["__index__", "col_int", "col_float", "col_bool", "col_str", "col_cat"]
1183+
assert result.dtypes == [pl.Datetime("ns"), pl.Int64, pl.Float32, pl.Boolean, pl.String, pl.Categorical]
1184+
assert result.equals(expected)

setup.cfg

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,10 +109,12 @@ exclude=
109109
# If any pins are required on the optional dependencies they also need explicit checks for e.g. `pyarrow.__version__`
110110
# in `dependencies.py`
111111
arrow =
112+
polars
112113
pyarrow
113114

114115
Testing =
115116
pytest
117+
polars
116118
pyarrow
117119
pytest-cpp
118120
pytest-timeout

0 commit comments

Comments
 (0)