From 3552624d654c3da54229499b23e17e6b581f7518 Mon Sep 17 00:00:00 2001 From: aladinor Date: Thu, 18 Dec 2025 09:26:29 -0600 Subject: [PATCH 1/5] Add configurable max_arrays_to_scan to ManifestPreloadConfig (#1464) Allow users to configure the maximum number of arrays scanned during manifest preloading. Previously hardcoded to 50, this limit could cause preloading to stop early for repositories with many nested groups. Default remains 50 for backward compatibility. --- Changelog.python.md | 8 ++++++++ docs/docs/configuration.md | 7 +++++++ icechunk-python/src/config.rs | 9 +++++++-- icechunk/src/config.rs | 7 ++++++- icechunk/src/repository.rs | 7 +++++-- icechunk/src/strategies.rs | 5 +++-- 6 files changed, 36 insertions(+), 7 deletions(-) diff --git a/Changelog.python.md b/Changelog.python.md index 9c2168a04..8203dc292 100644 --- a/Changelog.python.md +++ b/Changelog.python.md @@ -1,5 +1,13 @@ # Changelog +## Python Icechunk Library 1.1.12 + +### Features + +- Added `max_arrays_to_scan` parameter to `ManifestPreloadConfig` to control how many arrays + are scanned when looking for manifests to preload. This is useful for repositories with many + nested groups where coordinate arrays may appear later in the hierarchy. Default is 50. + ## Python Icechunk Library 1.1.11 ### Features diff --git a/docs/docs/configuration.md b/docs/docs/configuration.md index b833b4335..c5a80c484 100644 --- a/docs/docs/configuration.md +++ b/docs/docs/configuration.md @@ -116,6 +116,12 @@ This will add a second `VirtualChunkContainer` but not overwrite the first one t The manifest configuration for the repository. [`ManifestConfig`](./reference.md#icechunk.ManifestConfig) allows you to configure behavior for how manifests are loaded. In particular, the `preload` parameter allows you to configure the preload behavior of the manifest using a [`ManifestPreloadConfig`](./reference.md#icechunk.ManifestPreloadConfig). 
This allows you to control the number of references that are loaded into memory when a session is created, along with which manifests are available to be preloaded. +The `ManifestPreloadConfig` accepts the following parameters: + +- `max_total_refs`: Maximum total chunk references to preload across all manifests. +- `max_arrays_to_scan`: Maximum number of arrays to scan when looking for manifests to preload (default: 50). Increase this for repositories with many nested groups where coordinate arrays may appear later in the hierarchy. +- `preload_if`: A condition that determines which manifests should be preloaded. + #### Example For example, if we have a repo which contains data that we plan to open as an [`Xarray`](./xarray.md) dataset, we may want to configure the manifest preload to only preload manifests that contain arrays that are coordinates, in our case `time`, `latitude`, and `longitude`. @@ -124,6 +130,7 @@ For example, if we have a repo which contains data that we plan to open as an [` config.manifest = icechunk.ManifestConfig( preload=icechunk.ManifestPreloadConfig( max_total_refs=100_000_000, + max_arrays_to_scan=1000, preload_if=icechunk.ManifestPreloadCondition.name_matches(".*time|.*latitude|.*longitude"), ), ) diff --git a/icechunk-python/src/config.rs b/icechunk-python/src/config.rs index 3c17a3b91..72cc0b014 100644 --- a/icechunk-python/src/config.rs +++ b/icechunk-python/src/config.rs @@ -1100,17 +1100,20 @@ pub struct PyManifestPreloadConfig { pub max_total_refs: Option<u32>, #[pyo3(get, set)] pub preload_if: Option<Py<PyManifestPreloadCondition>>, + #[pyo3(get, set)] + pub max_arrays_to_scan: Option<u32> } #[pymethods] impl PyManifestPreloadConfig { #[new] - #[pyo3(signature = (max_total_refs=None, preload_if=None))] + #[pyo3(signature = (max_total_refs=None, preload_if=None, max_arrays_to_scan=None))] fn new( max_total_refs: Option<u32>, preload_if: Option<Py<PyManifestPreloadCondition>>, + max_arrays_to_scan: Option<u32> ) -> Self { - Self { max_total_refs, preload_if } + Self { max_total_refs, preload_if, 
max_arrays_to_scan } } } @@ -1127,6 +1130,7 @@ impl From<&PyManifestPreloadConfig> for ManifestPreloadConfig { Python::attach(|py| Self { max_total_refs: value.max_total_refs, preload_if: value.preload_if.as_ref().map(|c| (&*c.borrow(py)).into()), + max_arrays_to_scan: value.max_arrays_to_scan, }) } } @@ -1140,6 +1144,7 @@ impl From<ManifestPreloadConfig> for PyManifestPreloadConfig { Py::new(py, Into::<PyManifestPreloadCondition>::into(c)) .expect("Cannot create instance of ManifestPreloadCondition") }), + max_arrays_to_scan: value.max_arrays_to_scan, }) } } diff --git a/icechunk/src/config.rs b/icechunk/src/config.rs index 4fd9ebaaf..366198938 100644 --- a/icechunk/src/config.rs +++ b/icechunk/src/config.rs @@ -264,6 +264,7 @@ pub enum ManifestPreloadCondition { pub struct ManifestPreloadConfig { pub max_total_refs: Option<u32>, pub preload_if: Option<ManifestPreloadCondition>, + pub max_arrays_to_scan: Option<u32> } impl ManifestPreloadConfig { @@ -271,6 +272,7 @@ impl ManifestPreloadConfig { Self { max_total_refs: other.max_total_refs.or(self.max_total_refs), preload_if: other.preload_if.or(self.preload_if.clone()), + max_arrays_to_scan: other.max_arrays_to_scan.or(self.max_arrays_to_scan), } } @@ -278,6 +280,8 @@ impl ManifestPreloadConfig { self.max_total_refs.unwrap_or(10_000) } + pub fn max_arrays_to_scan(&self) -> u32 {self.max_arrays_to_scan.unwrap_or(50)} + pub fn preload_if(&self) -> &ManifestPreloadCondition { self.preload_if.as_ref().unwrap_or_else(|| { DEFAULT_MANIFEST_PRELOAD_CONDITION.get_or_init(|| { @@ -355,12 +359,13 @@ impl ManifestConfig { .get_or_init(ManifestSplittingConfig::default) }) } - // for testing only, create a config with no preloading and no splitting + // for testing only, create a config with no preloading, no splitting, and no max_arrays to scan pub fn empty() -> Self { ManifestConfig { preload: Some(ManifestPreloadConfig { max_total_refs: None, preload_if: None, + max_arrays_to_scan: None, }), splitting: None, } diff --git a/icechunk/src/repository.rs b/icechunk/src/repository.rs index 00b71d05d..c2f1dbb1e 
100644 --- a/icechunk/src/repository.rs +++ b/icechunk/src/repository.rs @@ -1520,6 +1520,7 @@ impl Repository { debug!("Preloading manifests"); let asset_manager = Arc::clone(self.asset_manager()); let preload_config = self.config().manifest().preload().clone(); + let max_arrays_to_scan = preload_config.max_arrays_to_scan() as usize; if preload_config.max_total_refs() == 0 || matches!(preload_config.preload_if(), ManifestPreloadCondition::False) { @@ -1535,8 +1536,7 @@ impl Repository { for node in snap .iter_arc(&Path::root()) .filter_ok(|node| node.node_type() == NodeType::Array) - // TODO: make configurable - .take(50) + .take(max_arrays_to_scan) { match node { Err(err) => { @@ -1923,6 +1923,7 @@ mod tests { preload: Some(ManifestPreloadConfig { max_total_refs: None, preload_if: None, + max_arrays_to_scan: None, }), splitting: Some(split_config.clone()), }; @@ -1948,6 +1949,7 @@ mod tests { preload: Some(ManifestPreloadConfig { max_total_refs: None, preload_if: None, + max_arrays_to_scan: None, }), splitting: Some(split_config.clone()), }; @@ -3211,6 +3213,7 @@ mod tests { preload: Some(ManifestPreloadConfig { max_total_refs: Some(2), preload_if: None, + max_arrays_to_scan: None, }), ..ManifestConfig::default() }; diff --git a/icechunk/src/strategies.rs b/icechunk/src/strategies.rs index 5d0cdf13f..4361a9f2b 100644 --- a/icechunk/src/strategies.rs +++ b/icechunk/src/strategies.rs @@ -281,9 +281,10 @@ pub fn manifest_split_condition() -> BoxedStrategy<ManifestSplitCondition> { prop_compose! 
{ pub fn manifest_preload_config() (max_total_refs in option::of(any::<u32>()), - preload_if in option::of(manifest_preload_condition()) + preload_if in option::of(manifest_preload_condition()), + max_arrays_to_scan in option::of(any::<u32>()) ) -> ManifestPreloadConfig { - ManifestPreloadConfig { max_total_refs, preload_if } + ManifestPreloadConfig { max_total_refs, preload_if, max_arrays_to_scan} } } From 4ac00d30e544dd21406dda6879588b93deed896b Mon Sep 17 00:00:00 2001 From: aladinor Date: Thu, 18 Dec 2025 09:39:54 -0600 Subject: [PATCH 2/5] Fix formatting issues --- icechunk-python/src/config.rs | 4 ++-- icechunk/src/config.rs | 6 ++++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/icechunk-python/src/config.rs b/icechunk-python/src/config.rs index 72cc0b014..68e80a0c1 100644 --- a/icechunk-python/src/config.rs +++ b/icechunk-python/src/config.rs @@ -1101,7 +1101,7 @@ pub struct PyManifestPreloadConfig { #[pyo3(get, set)] pub preload_if: Option<Py<PyManifestPreloadCondition>>, #[pyo3(get, set)] - pub max_arrays_to_scan: Option<u32> + pub max_arrays_to_scan: Option<u32>, } #[pymethods] @@ -1111,7 +1111,7 @@ impl PyManifestPreloadConfig { fn new( max_total_refs: Option<u32>, preload_if: Option<Py<PyManifestPreloadCondition>>, - max_arrays_to_scan: Option<u32> + max_arrays_to_scan: Option<u32>, ) -> Self { Self { max_total_refs, preload_if, max_arrays_to_scan } } diff --git a/icechunk/src/config.rs b/icechunk/src/config.rs index 366198938..a5f80bda0 100644 --- a/icechunk/src/config.rs +++ b/icechunk/src/config.rs @@ -264,7 +264,7 @@ pub enum ManifestPreloadCondition { pub struct ManifestPreloadConfig { pub max_total_refs: Option<u32>, pub preload_if: Option<ManifestPreloadCondition>, - pub max_arrays_to_scan: Option<u32> + pub max_arrays_to_scan: Option<u32>, } impl ManifestPreloadConfig { @@ -280,7 +280,9 @@ impl ManifestPreloadConfig { self.max_total_refs.unwrap_or(10_000) } - pub fn max_arrays_to_scan(&self) -> u32 {self.max_arrays_to_scan.unwrap_or(50)} + pub fn max_arrays_to_scan(&self) -> u32 { + self.max_arrays_to_scan.unwrap_or(50) + } pub fn preload_if(&self) -> 
&ManifestPreloadCondition { self.preload_if.as_ref().unwrap_or_else(|| { From 49e5bc0d635bbf020e95f37f6bb3eb1dbd031036 Mon Sep 17 00:00:00 2001 From: aladinor Date: Thu, 18 Dec 2025 10:07:23 -0600 Subject: [PATCH 3/5] Fix trailing whitespace in configuration.md --- docs/docs/configuration.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/docs/configuration.md b/docs/docs/configuration.md index c5a80c484..f9d876877 100644 --- a/docs/docs/configuration.md +++ b/docs/docs/configuration.md @@ -130,7 +130,7 @@ For example, if we have a repo which contains data that we plan to open as an [` config.manifest = icechunk.ManifestConfig( preload=icechunk.ManifestPreloadConfig( max_total_refs=100_000_000, - max_arrays_to_scan=1000, + max_arrays_to_scan=1000, preload_if=icechunk.ManifestPreloadCondition.name_matches(".*time|.*latitude|.*longitude"), ), ) From d89d08115ad47d393a0f00b4cb660e6afac1322e Mon Sep 17 00:00:00 2001 From: aladinor Date: Thu, 18 Dec 2025 10:14:51 -0600 Subject: [PATCH 4/5] Add max_arrays_to_scan to Python type stubs --- .../python/icechunk/_icechunk_python.pyi | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/icechunk-python/python/icechunk/_icechunk_python.pyi b/icechunk-python/python/icechunk/_icechunk_python.pyi index 43b0c51cf..743017d02 100644 --- a/icechunk-python/python/icechunk/_icechunk_python.pyi +++ b/icechunk-python/python/icechunk/_icechunk_python.pyi @@ -582,6 +582,7 @@ class ManifestPreloadConfig: self, max_total_refs: int | None = None, preload_if: ManifestPreloadCondition | None = None, + max_arrays_to_scan: int | None = None, ) -> None: """ Create a new `ManifestPreloadConfig` object @@ -592,6 +593,9 @@ class ManifestPreloadConfig: The maximum number of references to preload. preload_if: ManifestPreloadCondition | None The condition under which manifests will be preloaded. + max_arrays_to_scan: int | None + The maximum number of arrays to scan when looking for manifests to preload. 
+ Default is 50. Increase for repositories with many nested groups. """ ... @property @@ -638,6 +642,28 @@ class ManifestPreloadConfig: The condition under which manifests will be preloaded. """ ... + @property + def max_arrays_to_scan(self) -> int | None: + """ + The maximum number of arrays to scan when looking for manifests to preload. + + Returns + ------- + int | None + The maximum number of arrays to scan. Default is 50. + """ + ... + @max_arrays_to_scan.setter + def max_arrays_to_scan(self, value: int | None) -> None: + """ + Set the maximum number of arrays to scan when looking for manifests to preload. + + Parameters + ---------- + value: int | None + The maximum number of arrays to scan. + """ + ... class ManifestSplitCondition: """Configuration for conditions under which manifests will be split into splits""" From e35afe1f68fd5c190cacd80cd8bb96995b33f3c6 Mon Sep 17 00:00:00 2001 From: aladinor Date: Thu, 18 Dec 2025 10:21:54 -0600 Subject: [PATCH 5/5] Remove premature changelog entry per reviewer feedback --- Changelog.python.md | 8 -------- 1 file changed, 8 deletions(-) diff --git a/Changelog.python.md b/Changelog.python.md index 8203dc292..9c2168a04 100644 --- a/Changelog.python.md +++ b/Changelog.python.md @@ -1,13 +1,5 @@ # Changelog -## Python Icechunk Library 1.1.12 - -### Features - -- Added `max_arrays_to_scan` parameter to `ManifestPreloadConfig` to control how many arrays - are scanned when looking for manifests to preload. This is useful for repositories with many - nested groups where coordinate arrays may appear later in the hierarchy. Default is 50. - ## Python Icechunk Library 1.1.11 ### Features