Skip to content

Commit 8e6763a

Browse files
authored
[18362773160] Introduce EXPERIMENTAL_POLARS output format (#2769)
#### Reference Issues/PRs Monday ref: 18362773160 #### What does this implement or fix? Polars output format is just a thin wrapper around the arrow output format. We create the polars dataframe zero-copy from the pyarrow table. Also improves some docs. Adds just a few extra tests for polars because: - Extensive arrow testing covers most arrow related logic - Parametrizing many tests to work with polars is difficult because `polars.DataFrame` does not have any concept of pandas metadata. #### Any other comments? I decided to go through `pyarrow` even though we could avoid the pyarrow dependency by using a [PyCapsule](https://docs.pola.rs/user-guide/misc/arrow/#using-polars-directly), because: - This would require rewriting our arrow denormalization (which currently relies on pyarrow APIs) - This would require extra testing coverage of polars output format. And it is harder to parametrize our existing tests because polars doesn't have a concept of pandas metadata. Also needed to clean up some space for the conda build. This is done in the conda workflow. A successful run with the workflow from this branch can be seen [here](https://github.com/man-group/ArcticDB/actions/runs/19636884795/job/56229690182). #### Checklist <details> <summary> Checklist for code changes... </summary> - [ ] Have you updated the relevant docstrings, documentation and copyright notice? - [ ] Is this contribution tested against [all ArcticDB's features](../docs/mkdocs/docs/technical/contributing.md)? - [ ] Do all exceptions introduced raise appropriate [error messages](https://docs.arcticdb.io/error_messages/)? - [ ] Are API changes highlighted in the PR description? - [ ] Is the PR labelled as enhancement or bug so it appears in autogenerated release notes? </details> <!-- Thanks for contributing a Pull Request to ArcticDB! 
Please ensure you have taken a look at: - ArcticDB's Code of Conduct: https://github.com/man-group/ArcticDB/blob/master/CODE_OF_CONDUCT.md - ArcticDB's Contribution Licensing: https://github.com/man-group/ArcticDB/blob/master/docs/mkdocs/docs/technical/contributing.md#contribution-licensing -->
1 parent 588a901 commit 8e6763a

File tree

10 files changed

+132
-54
lines changed

10 files changed

+132
-54
lines changed

.github/workflows/build_with_conda.yml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,13 @@ jobs:
6565
mongodb:
6666
image: mongo:4.4
6767
steps:
68+
- name: Free Disk Space (Ubuntu)
69+
uses: jlumbroso/[email protected]
70+
with:
71+
tool-cache: false
72+
large-packages: false # Time-consuming but doesn't save that much space (4GB)
73+
docker-images: false # We're using docker images we don't want to clear
74+
6875
- uses: actions/[email protected]
6976
# DONT use recursive submodules checkout to simulate conda feedstock build
7077
# with:

environment-dev.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,7 @@ dependencies:
7979
- pytest-timeout
8080
- azure-storage-blob
8181
- azure-identity
82+
- polars
8283
- pyarrow
8384
- asv
8485
- pymongo

python/arcticdb/dependencies.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -31,9 +31,7 @@ def _import_optional_dependency(module_name: str) -> Tuple[ModuleType, bool]:
3131

3232

3333
pyarrow, _PYARROW_AVAILABLE = _import_optional_dependency("pyarrow")
34+
polars, _POLARS_AVAILABLE = _import_optional_dependency("polars")
3435

3536

36-
__all__ = [
37-
"pyarrow",
38-
"_PYARROW_AVAILABLE",
39-
]
37+
__all__ = ["pyarrow", "_PYARROW_AVAILABLE", "polars", "_POLARS_AVAILABLE"]

python/arcticdb/options.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
from typing import Optional, Union
1010
from enum import Enum
1111

12-
from arcticdb.dependencies import _PYARROW_AVAILABLE
12+
from arcticdb.dependencies import _PYARROW_AVAILABLE, _POLARS_AVAILABLE
1313
from arcticdb.dependencies import pyarrow as pa
1414
from arcticdb.encoding_version import EncodingVersion
1515
from arcticdb_ext.storage import ModifiableLibraryOption, ModifiableEnterpriseLibraryOption
@@ -171,6 +171,7 @@ def __repr__(self):
171171
class OutputFormat(str, Enum):
172172
PANDAS = "PANDAS"
173173
EXPERIMENTAL_ARROW = "EXPERIMENTAL_ARROW"
174+
EXPERIMENTAL_POLARS = "EXPERIMENTAL_POLARS"
174175

175176

176177
def output_format_to_internal(output_format: Union[OutputFormat, str]) -> InternalOutputFormat:
@@ -182,6 +183,12 @@ def output_format_to_internal(output_format: Union[OutputFormat, str]) -> Intern
182183
"ArcticDB's pyarrow optional dependency missing but is required to use arrow output format."
183184
)
184185
return InternalOutputFormat.ARROW
186+
elif output_format.lower() == OutputFormat.EXPERIMENTAL_POLARS.lower():
187+
if not _PYARROW_AVAILABLE or not _POLARS_AVAILABLE:
188+
raise ModuleNotFoundError(
189+
"ArcticDB's pyarrow or polars optional dependencies are missing but are required to use polars output format."
190+
)
191+
return InternalOutputFormat.ARROW
185192
else:
186193
raise ValueError(f"Unknown OutputFormat: {output_format}")
187194

python/arcticdb/util/test.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
import attr
2323
from functools import wraps, reduce
2424
from arcticdb.dependencies import pyarrow as pa
25+
from arcticdb.dependencies import polars as pl
2526

2627
from arcticdb.util.marks import SHORTER_LOGS
2728

@@ -252,6 +253,14 @@ def assert_frame_equal_with_arrow(left, right, **kwargs):
252253
left = convert_arrow_to_pandas_for_tests(left)
253254
if isinstance(right, pa.Table):
254255
right = convert_arrow_to_pandas_for_tests(right)
256+
257+
# Polars does not have a concept of pandas metadata, so this assertion would fail on any pandas dataframe, which
258+
# has indices or any pandas specific column names (e.g. integers, duplicated column names, etc.)
259+
if isinstance(left, pl.DataFrame):
260+
left = left.to_pandas()
261+
if isinstance(right, pl.DataFrame):
262+
right = right.to_pandas()
263+
255264
assert_frame_equal(left, right, **kwargs)
256265

257266

python/arcticdb/version_store/_store.py

Lines changed: 57 additions & 47 deletions
Large diffs are not rendered by default.

python/tests/conftest.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1106,6 +1106,9 @@ def lmdb_version_store_arrow(lmdb_version_store_v1) -> NativeVersionStore:
11061106
return store
11071107

11081108

1109+
# Explicitly not including `OutputFormat.EXPERIMENTAL_POLARS` as `polars.to_pandas()` is not index aware, so all
1110+
# `assert_frame_equal_with_arrow` would not work. Also POLARS is just a thin wrapper on top of pyarrow, so testing
1111+
just one is sufficient.
11091112
@pytest.fixture(
11101113
params=[OutputFormat.PANDAS, pytest.param(OutputFormat.EXPERIMENTAL_ARROW, marks=PYARROW_POST_PROCESSING)]
11111114
)

python/tests/unit/arcticdb/test_arrow_api.py

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import numpy as np
22
import pandas as pd
33
import pyarrow as pa
4+
import polars as pl
45
import pytest
56

67
from arcticdb import LazyDataFrame, DataError, concat
@@ -18,15 +19,29 @@
1819
OutputFormat.EXPERIMENTAL_ARROW,
1920
"EXPERIMENTAL_ARROW",
2021
"experimental_arrow",
22+
OutputFormat.EXPERIMENTAL_POLARS,
23+
"EXPERIMENTAL_POLARS",
24+
"experimental_polars",
25+
]
26+
no_str_output_format_args = [
27+
None,
28+
OutputFormat.PANDAS,
29+
OutputFormat.EXPERIMENTAL_ARROW,
30+
OutputFormat.EXPERIMENTAL_POLARS,
2131
]
22-
no_str_output_format_args = [None, OutputFormat.PANDAS, OutputFormat.EXPERIMENTAL_ARROW]
2332

2433

2534
def expected_output_type(arctic_output_format, library_output_format, output_format_override):
2635
expected_output_format = (
2736
output_format_override or library_output_format or arctic_output_format or OutputFormat.PANDAS
2837
)
29-
return pa.Table if expected_output_format.lower() == OutputFormat.EXPERIMENTAL_ARROW.lower() else pd.DataFrame
38+
if expected_output_format.lower() == OutputFormat.PANDAS.lower():
39+
return pd.DataFrame
40+
if expected_output_format.lower() == OutputFormat.EXPERIMENTAL_ARROW.lower():
41+
return pa.Table
42+
if expected_output_format.lower() == OutputFormat.EXPERIMENTAL_POLARS.lower():
43+
return pl.DataFrame
44+
raise ValueError("Unexpected format")
3045

3146

3247
@pytest.mark.parametrize("arctic_output_format", no_str_output_format_args)

python/tests/unit/arcticdb/version_store/test_arrow_read.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
dataframe_strategy,
2121
column_strategy,
2222
)
23+
import polars as pl
2324
from arcticdb.util.test import get_sample_dataframe, make_dynamic
2425
from arcticdb.util._versions import IS_PANDAS_ONE
2526
from arcticdb_ext.storage import KeyType
@@ -1156,3 +1157,28 @@ def test_arrow_read_batch_with_strings(lmdb_version_store_arrow):
11561157
assert table_2.schema.field(0).type == pa.string() # global default for all symbols
11571158
assert table_2.schema.field(1).type == pa.dictionary(pa.int32(), pa.large_string()) # per_column override
11581159
assert_frame_equal_with_arrow(table_2, df_2)
1160+
1161+
1162+
def test_polars_basic(lmdb_version_store_arrow):
1163+
lib = lmdb_version_store_arrow
1164+
lib.set_output_format(OutputFormat.EXPERIMENTAL_POLARS)
1165+
sym = "polars"
1166+
df = pd.DataFrame(
1167+
{
1168+
"col_int": np.arange(10, dtype=np.int64),
1169+
"col_float": np.arange(10, dtype=np.float32),
1170+
"col_bool": [i % 2 == 0 for i in range(10)],
1171+
"col_str": ["x" * (i + 1) for i in range(10)],
1172+
"col_cat": [f"str_{i%3}" for i in range(10)],
1173+
},
1174+
index=pd.date_range(pd.Timestamp(2025, 1, 1), periods=10),
1175+
)
1176+
lib.write(sym, df)
1177+
result = lib.read(sym, arrow_string_format_per_column={"col_cat": ArrowOutputStringFormat.CATEGORICAL}).data
1178+
df.index.name = "__index__"
1179+
expected = pl.from_pandas(df.reset_index())
1180+
expected = expected.with_columns(pl.col("col_cat").cast(pl.Categorical))
1181+
expected.columns[0] = "__index__"
1182+
assert result.columns == ["__index__", "col_int", "col_float", "col_bool", "col_str", "col_cat"]
1183+
assert result.dtypes == [pl.Datetime("ns"), pl.Int64, pl.Float32, pl.Boolean, pl.String, pl.Categorical]
1184+
assert result.equals(expected)

setup.cfg

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,10 +109,12 @@ exclude=
109109
# If any pins are required on the optional dependencies they also need explicit checks for e.g. `pyarrow.__version__`
110110
# in `dependencies.py`
111111
arrow =
112+
polars
112113
pyarrow
113114

114115
Testing =
115116
pytest
117+
polars
116118
pyarrow
117119
pytest-cpp
118120
pytest-timeout

0 commit comments

Comments
 (0)