earth-mover · dcherian · May 13, 2025 · Feb 27, 2025 · Mar 4, 2025 · Mar 6, 2025
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -34,6 +34,6 @@ repos:
         entry: just pre-commit
         language: system
         pass_filenames: false
-        files: "icechunk/"
+        files: "(^|.*/)(icechunk/|icechunk-python/src/).*"
 
 exclude: 'tests/data/.*'
diff --git a/Justfile b/Justfile
@@ -5,6 +5,9 @@ alias pre := pre-commit
 test *args='':
   cargo test --all --all-targets {{args}}
 
+doctest *args='':
+  cargo test --doc {{args}}
+
 # run all tests with logs enabled
 test-logs level *args='':
   RUST_LOG=icechunk={{level}} cargo test --all --all-targets {{args}} -- --nocapture
@@ -46,6 +49,7 @@ pre-commit $RUSTFLAGS="-D warnings -W unreachable-pub -W bare-trait-objects":
   just build
   just format "--check"
   just lint "-p icechunk -p icechunk-python"
+  just doctest
   just test
   just run-all-examples
   just check-deps

diff --git a/docs/.readthedocs.yaml b/docs/.readthedocs.yaml
@@ -6,11 +6,10 @@ build:
     python: "mambaforge-latest"
 
   jobs:
-     pre_create_environment:
-       - conda update --yes --quiet --name=base --channel=defaults conda
      install:
       - which mamba
-      - cd icechunk-python && maturin build && pip install ../target/wheels/*.whl && cd ../docs
+      - cd icechunk-python && maturin build && pip install "$(ls ../target/wheels/*.whl | head -n 1)[docs]" && cd ../docs
+      - mamba list
       # - cd icechunk-python && maturin develop && cd ../docs
 
 

diff --git a/docs/doc-env.yml b/docs/doc-env.yml
@@ -4,39 +4,8 @@ channels:
   - conda-forge
   - nodefaults
 dependencies:
-  - python>=3.10
-  - "sphinx"
+  - python>=3.12
   - pip
   - maturin
-  - xarray
-  - pooch
-  - scipy
-  - dask
-  - cftime
-  - distributed
   - cairo
   - rust
-  - maturin
-  - pip:
-    - "myst_nb"
-    - mkdocs-awesome-pages-plugin
-    - "pydata_sphinx_theme"
-    - mkdocs-mermaid2-plugin
-    - markdown-exec
-    - mkdocs-breadcrumbs-plugin
-    - mkdocs-minify-plugin
-    - mkdocs-open-in-new-tab
-    - mkdocs
-    - mkdocs-material[imaging]
-    - mkdocstrings[python]
-    - mkdocs-jupyter
-    - mkdocs-git-revision-date-localized-plugin
-    - mkdocs-git-committers-plugin-2
-    - mkdocs-macros-plugin
-    - mkdocs-include-markdown-plugin
-    - mkdocs-redirects
-    - mkdocs-git-authors-plugin
-    - "sphinx_copybutton"
-    - "sphinx_design"
-    - "sphinx_togglebutton"
-    - "sphinx-autodoc-typehints"
diff --git a/docs/docs/icechunk-python/performance.md b/docs/docs/icechunk-python/performance.md
@@ -0,0 +1,90 @@
+# Performance
+
+!!! info
+
+    This is advanced material, and you will need it only if you have arrays with more than a million chunks.
+    Icechunk aims to provide an excellent experience out of the box.
+
+## Preloading manifests
+
+Coming Soon.
+
+## Splitting manifests
+
+Icechunk stores chunk references in chunk manifest files under `manifests/`.
+For very large arrays (millions of chunks), these files can get quite large.
+By default, Icechunk stores all chunk references in a single manifest file per array.
+Requesting even a single chunk requires downloading the entire manifest.
+In some cases, this can result in a slow time-to-first-byte or large memory usage.
+
+!!! note
+
+    Note that the chunk sizes in the following examples are tiny for demonstration purposes.
+
+To avoid downloading one large manifest for every read, Icechunk lets you split the manifest files by specifying a `ManifestSplittingConfig`.
+
+```python exec="on" session="perf" source="material-block"
+import icechunk as ic
+from icechunk import ManifestSplitCondition, ManifestSplittingConfig, ManifestSplitDimCondition
+
+split_config = ManifestSplittingConfig.from_dict(
+    {
+        ManifestSplitCondition.AnyArray(): {
+            ManifestSplitDimCondition.DimensionName("time"): 365 * 24
+        }
+    }
+)
+repo_config = ic.RepositoryConfig(manifest=ic.ManifestConfig(splitting=split_config))
+```
+
+Then pass the config to `Repository.open` or `Repository.create`:
+```python
+repo = ic.Repository.open(..., config=repo_config)
+```
+
+This particular example splits manifests so that each manifest file contains `365 * 24` chunks along the time dimension and all chunks along every other dimension.
+
+Options for specifying the arrays whose manifest you want to split are:
+
+1. [`ManifestSplitCondition.name_matches`](./reference.md#icechunk.ManifestSplitCondition.name_matches) takes a regular expression used to match an array's name;
+2. [`ManifestSplitCondition.path_matches`](./reference.md#icechunk.ManifestSplitCondition.path_matches) takes a regular expression used to match an array's path;
+3. [`ManifestSplitCondition.and_conditions`](./reference.md#icechunk.ManifestSplitCondition.and_conditions) to combine (1), (2), and (4) together; and
+4. [`ManifestSplitCondition.or_conditions`](./reference.md#icechunk.ManifestSplitCondition.or_conditions) to combine (1), (2), and (3) together.
+
+
+`and_conditions` and `or_conditions` may be used to combine multiple path and/or name matches. For example,
+```python exec="on" session="perf" source="material-block"
+array_condition = ManifestSplitCondition.or_conditions(
+    [
+        ManifestSplitCondition.name_matches("temperature"),
+        ManifestSplitCondition.name_matches("salinity"),
+    ]
+)
+sconfig = ManifestSplittingConfig.from_dict(
+    {array_condition: {ManifestSplitDimCondition.DimensionName("longitude"): 3}}
+)
+```
+
+Options for specifying how to split along a specific axis or dimension are:
+
+1. [`ManifestSplitDimCondition.Axis`](./reference.md#icechunk.ManifestSplitDimCondition.Axis) takes an integer axis;
+2. [`ManifestSplitDimCondition.DimensionName`](./reference.md#icechunk.ManifestSplitDimCondition.DimensionName) takes a regular expression used to match the dimension names of the array;
+3. [`ManifestSplitDimCondition.Any`](./reference.md#icechunk.ManifestSplitDimCondition.Any) matches any _remaining_ dimension name or axis.
+
+
+For example, for an array with dimensions `time, latitude, longitude`, the following config
+```python exec="on" session="perf" source="material-block"
+from icechunk import ManifestSplitDimCondition
+
+{
+    ManifestSplitDimCondition.DimensionName("longitude"): 3,
+    ManifestSplitDimCondition.Axis(1): 2,
+    ManifestSplitDimCondition.Any(): 1,
+}
+```
+will result in splitting manifests so that each manifest contains (3 longitude chunks x 2 latitude chunks x 1 time chunk) = 6 chunks per manifest file.
+
+
+!!! note
+
+    Python dictionaries preserve insertion order, so the first condition encountered takes priority.
diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml
@@ -188,6 +188,7 @@ nav:
     - Quickstart: icechunk-python/quickstart.md
     - Configuration: icechunk-python/configuration.md
     - Storage: icechunk-python/storage.md
+    - Performance: icechunk-python/performance.md
     - FAQ: icechunk-python/faq.md
     - Xarray: icechunk-python/xarray.md
     - Parallel Writes: icechunk-python/parallel.md

diff --git a/icechunk-python/benchmarks/conftest.py b/icechunk-python/benchmarks/conftest.py
@@ -1,14 +1,13 @@
+import copy
 from typing import cast
 
 import pytest
 
 from benchmarks import helpers
 from benchmarks.datasets import (
-    ERA5,
-    ERA5_ARCO,
-    ERA5_SINGLE,
-    GB_8MB_CHUNKS,
-    GB_128MB_CHUNKS,
+    LARGE_1D,
+    LARGE_MANIFEST_SHARDED,
+    LARGE_MANIFEST_UNSHARDED,
     PANCAKE_WRITES,
     SIMPLE_1D,
     TEST_BUCKETS,
@@ -18,15 +17,22 @@
 )
 from icechunk import Repository, local_filesystem_storage
 
+try:  # feature-detect manifest splitting in the installed icechunk
+    from icechunk import ManifestSplittingConfig  # noqa: F401
+
+    no_splitting = False
+except ImportError:
+    no_splitting = True  # gates the skipif mark on the split-manifest param
+
 
 def request_to_dataset(request, moar_prefix: str = "") -> Dataset:
     extra_prefix = request.config.getoption("--icechunk-prefix") + moar_prefix
     where = request.config.getoption("--where")
-    ds = request.param
+    ds = copy.deepcopy(request.param)
     if where == "local" and ds.skip_local:
         pytest.skip()
-    # for some reason, this gets run multiple times so we apply the prefix repeatedly
-    # if we don't catch that :(
+    # this gets run multiple times because the fixture scope is 'function'
+    # so we need this `force_idempotent` ugliness
     ds.storage_config = ds.storage_config.with_overwrite(
         **TEST_BUCKETS[where]
     ).with_extra(prefix=extra_prefix, force_idempotent=True)
@@ -50,13 +56,26 @@ def simple_write_dataset(request) -> BenchmarkWriteDataset:
     return cast(BenchmarkWriteDataset, ds)
 
 
+@pytest.fixture(params=[pytest.param(LARGE_1D, id="large-1d")])
+def large_write_dataset(request) -> BenchmarkWriteDataset:
+    moar_prefix = helpers.rdms()  # NOTE(review): presumably a random suffix; confirm
+    ds = request_to_dataset(request, moar_prefix=moar_prefix)
+    return cast(BenchmarkWriteDataset, ds)  # static-type narrowing only
+
+
 @pytest.fixture(
     params=[
-        pytest.param(GB_8MB_CHUNKS, id="gb-8mb"),
-        pytest.param(GB_128MB_CHUNKS, id="gb-128mb"),
-        pytest.param(ERA5_SINGLE, id="era5-single"),
-        pytest.param(ERA5, id="era5-weatherbench"),
-        pytest.param(ERA5_ARCO, id="era5-arco"),
+        # pytest.param(GB_8MB_CHUNKS, id="gb-8mb"),
+        # pytest.param(GB_128MB_CHUNKS, id="gb-128mb"),
+        # pytest.param(ERA5_SINGLE, id="era5-single"),
+        # pytest.param(ERA5, id="era5-weatherbench"),
+        # pytest.param(ERA5_ARCO, id="era5-arco"),
+        pytest.param(LARGE_MANIFEST_UNSHARDED, id="large-manifest-no-split"),
+        pytest.param(
+            LARGE_MANIFEST_SHARDED,
+            id="large-manifest-split",
+            marks=pytest.mark.skipif(no_splitting, reason="no splitting"),
+        ),
     ],
 )
 def synth_dataset(request) -> BenchmarkReadDataset: