diff --git a/docs/docs/configuration.md b/docs/docs/configuration.md index b833b4335..f9d876877 100644 --- a/docs/docs/configuration.md +++ b/docs/docs/configuration.md @@ -116,6 +116,12 @@ This will add a second `VirtualChunkContainer` but not overwrite the first one t The manifest configuration for the repository. [`ManifestConfig`](./reference.md#icechunk.ManifestConfig) allows you to configure behavior for how manifests are loaded. In particular, the `preload` parameter allows you to configure the preload behavior of the manifest using a [`ManifestPreloadConfig`](./reference.md#icechunk.ManifestPreloadConfig). This allows you to control the number of references that are loaded into memory when a session is created, along with which manifests are available to be preloaded. +The `ManifestPreloadConfig` accepts the following parameters: + +- `max_total_refs`: Maximum total chunk references to preload across all manifests (default: 10,000). +- `max_arrays_to_scan`: Maximum number of arrays to scan when looking for manifests to preload (default: 50). Increase this for repositories with many nested groups where coordinate arrays may appear later in the hierarchy. +- `preload_if`: A condition that determines which manifests should be preloaded. + #### Example For example, if we have a repo which contains data that we plan to open as an [`Xarray`](./xarray.md) dataset, we may want to configure the manifest preload to only preload manifests that contain arrays that are coordinates, in our case `time`, `latitude`, and `longitude`. 
@@ -124,6 +130,7 @@ For example, if we have a repo which contains data that we plan to open as an [` config.manifest = icechunk.ManifestConfig( preload=icechunk.ManifestPreloadConfig( max_total_refs=100_000_000, + max_arrays_to_scan=1000, preload_if=icechunk.ManifestPreloadCondition.name_matches(".*time|.*latitude|.*longitude"), ), ) diff --git a/icechunk-python/python/icechunk/_icechunk_python.pyi b/icechunk-python/python/icechunk/_icechunk_python.pyi index 43b0c51cf..743017d02 100644 --- a/icechunk-python/python/icechunk/_icechunk_python.pyi +++ b/icechunk-python/python/icechunk/_icechunk_python.pyi @@ -582,6 +582,7 @@ class ManifestPreloadConfig: self, max_total_refs: int | None = None, preload_if: ManifestPreloadCondition | None = None, + max_arrays_to_scan: int | None = None, ) -> None: """ Create a new `ManifestPreloadConfig` object @@ -592,6 +593,9 @@ class ManifestPreloadConfig: The maximum number of references to preload. preload_if: ManifestPreloadCondition | None The condition under which manifests will be preloaded. + max_arrays_to_scan: int | None + The maximum number of arrays to scan when looking for manifests to preload. + Default is 50. Increase for repositories with many nested groups. """ ... @property @@ -638,6 +642,28 @@ class ManifestPreloadConfig: The condition under which manifests will be preloaded. """ ... + @property + def max_arrays_to_scan(self) -> int | None: + """ + The maximum number of arrays to scan when looking for manifests to preload. + + Returns + ------- + int | None + The maximum number of arrays to scan. Default is 50. + """ + ... + @max_arrays_to_scan.setter + def max_arrays_to_scan(self, value: int | None) -> None: + """ + Set the maximum number of arrays to scan when looking for manifests to preload. + + Parameters + ---------- + value: int | None + The maximum number of arrays to scan. + """ + ... 
class ManifestSplitCondition: """Configuration for conditions under which manifests will be split into splits""" diff --git a/icechunk-python/src/config.rs b/icechunk-python/src/config.rs index 3c17a3b91..68e80a0c1 100644 --- a/icechunk-python/src/config.rs +++ b/icechunk-python/src/config.rs @@ -1100,17 +1100,20 @@ pub struct PyManifestPreloadConfig { pub max_total_refs: Option, #[pyo3(get, set)] pub preload_if: Option>, + #[pyo3(get, set)] + pub max_arrays_to_scan: Option, } #[pymethods] impl PyManifestPreloadConfig { #[new] - #[pyo3(signature = (max_total_refs=None, preload_if=None))] + #[pyo3(signature = (max_total_refs=None, preload_if=None, max_arrays_to_scan=None))] fn new( max_total_refs: Option, preload_if: Option>, + max_arrays_to_scan: Option, ) -> Self { - Self { max_total_refs, preload_if } + Self { max_total_refs, preload_if, max_arrays_to_scan } } } @@ -1127,6 +1130,7 @@ impl From<&PyManifestPreloadConfig> for ManifestPreloadConfig { Python::attach(|py| Self { max_total_refs: value.max_total_refs, preload_if: value.preload_if.as_ref().map(|c| (&*c.borrow(py)).into()), + max_arrays_to_scan: value.max_arrays_to_scan, }) } } @@ -1140,6 +1144,7 @@ impl From for PyManifestPreloadConfig { Py::new(py, Into::::into(c)) .expect("Cannot create instance of ManifestPreloadCondition") }), + max_arrays_to_scan: value.max_arrays_to_scan, }) } } diff --git a/icechunk/src/config.rs b/icechunk/src/config.rs index 4fd9ebaaf..a5f80bda0 100644 --- a/icechunk/src/config.rs +++ b/icechunk/src/config.rs @@ -264,6 +264,7 @@ pub enum ManifestPreloadCondition { pub struct ManifestPreloadConfig { pub max_total_refs: Option, pub preload_if: Option, + pub max_arrays_to_scan: Option, } impl ManifestPreloadConfig { @@ -271,6 +272,7 @@ impl ManifestPreloadConfig { Self { max_total_refs: other.max_total_refs.or(self.max_total_refs), preload_if: other.preload_if.or(self.preload_if.clone()), + max_arrays_to_scan: other.max_arrays_to_scan.or(self.max_arrays_to_scan), } } @@ -278,6 
+280,10 @@ impl ManifestPreloadConfig { self.max_total_refs.unwrap_or(10_000) } + pub fn max_arrays_to_scan(&self) -> u32 { + self.max_arrays_to_scan.unwrap_or(50) + } + pub fn preload_if(&self) -> &ManifestPreloadCondition { self.preload_if.as_ref().unwrap_or_else(|| { DEFAULT_MANIFEST_PRELOAD_CONDITION.get_or_init(|| { @@ -355,12 +361,13 @@ impl ManifestConfig { .get_or_init(ManifestSplittingConfig::default) }) } - // for testing only, create a config with no preloading and no splitting + // for testing only, create a config with no preloading, no splitting, and max_arrays_to_scan unset (falls back to the default of 50) pub fn empty() -> Self { ManifestConfig { preload: Some(ManifestPreloadConfig { max_total_refs: None, preload_if: None, + max_arrays_to_scan: None, }), splitting: None, } diff --git a/icechunk/src/repository.rs b/icechunk/src/repository.rs index 77a52a3a5..6199effe6 100644 --- a/icechunk/src/repository.rs +++ b/icechunk/src/repository.rs @@ -1534,6 +1534,7 @@ impl Repository { debug!("Preloading manifests"); let asset_manager = Arc::clone(self.asset_manager()); let preload_config = self.config().manifest().preload().clone(); + let max_arrays_to_scan = preload_config.max_arrays_to_scan() as usize; if preload_config.max_total_refs() == 0 || matches!(preload_config.preload_if(), ManifestPreloadCondition::False) { @@ -1549,8 +1550,7 @@ impl Repository { for node in snap .iter_arc(&Path::root()) .filter_ok(|node| node.node_type() == NodeType::Array) - // TODO: make configurable - .take(50) + .take(max_arrays_to_scan) { match node { Err(err) => { @@ -1937,6 +1937,7 @@ mod tests { preload: Some(ManifestPreloadConfig { max_total_refs: None, preload_if: None, + max_arrays_to_scan: None, }), splitting: Some(split_config.clone()), }; @@ -1962,6 +1963,7 @@ mod tests { preload: Some(ManifestPreloadConfig { max_total_refs: None, preload_if: None, + max_arrays_to_scan: None, }), splitting: Some(split_config.clone()), }; @@ -3225,6 +3227,7 @@ mod tests { preload: Some(ManifestPreloadConfig { 
max_total_refs: Some(2), preload_if: None, + max_arrays_to_scan: None, }), ..ManifestConfig::default() }; diff --git a/icechunk/src/strategies.rs b/icechunk/src/strategies.rs index 5d0cdf13f..4361a9f2b 100644 --- a/icechunk/src/strategies.rs +++ b/icechunk/src/strategies.rs @@ -281,9 +281,10 @@ pub fn manifest_split_condition() -> BoxedStrategy { prop_compose! { pub fn manifest_preload_config() (max_total_refs in option::of(any::()), - preload_if in option::of(manifest_preload_condition()) + preload_if in option::of(manifest_preload_condition()), + max_arrays_to_scan in option::of(any::()) ) -> ManifestPreloadConfig { - ManifestPreloadConfig { max_total_refs, preload_if } + ManifestPreloadConfig { max_total_refs, preload_if, max_arrays_to_scan} } }