Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions docs/docs/configuration.md
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,12 @@ This will add a second `VirtualChunkContainer` but not overwrite the first one t

The manifest configuration for the repository. [`ManifestConfig`](./reference.md#icechunk.ManifestConfig) lets you configure how manifests are loaded. In particular, its `preload` parameter takes a [`ManifestPreloadConfig`](./reference.md#icechunk.ManifestPreloadConfig), which controls how many chunk references are loaded into memory when a session is created and which manifests are eligible to be preloaded.

The `ManifestPreloadConfig` accepts the following parameters:

- `max_total_refs`: Maximum total chunk references to preload across all manifests (default: 10,000).
- `max_arrays_to_scan`: Maximum number of arrays to scan when looking for manifests to preload (default: 50). Increase this for repositories with many nested groups where coordinate arrays may appear later in the hierarchy.
- `preload_if`: A condition that determines which manifests should be preloaded.

#### Example

For example, if we have a repo which contains data that we plan to open as an [`Xarray`](./xarray.md) dataset, we may want to configure the manifest preload to only preload manifests that contain arrays that are coordinates, in our case `time`, `latitude`, and `longitude`.
Expand All @@ -124,6 +130,7 @@ For example, if we have a repo which contains data that we plan to open as an [`
config.manifest = icechunk.ManifestConfig(
preload=icechunk.ManifestPreloadConfig(
max_total_refs=100_000_000,
max_arrays_to_scan=1000,
preload_if=icechunk.ManifestPreloadCondition.name_matches(".*time|.*latitude|.*longitude"),
),
)
Expand Down
26 changes: 26 additions & 0 deletions icechunk-python/python/icechunk/_icechunk_python.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -582,6 +582,7 @@ class ManifestPreloadConfig:
self,
max_total_refs: int | None = None,
preload_if: ManifestPreloadCondition | None = None,
max_arrays_to_scan: int | None = None,
) -> None:
"""
Create a new `ManifestPreloadConfig` object
Expand All @@ -592,6 +593,9 @@ class ManifestPreloadConfig:
The maximum number of references to preload.
preload_if: ManifestPreloadCondition | None
The condition under which manifests will be preloaded.
max_arrays_to_scan: int | None
The maximum number of arrays to scan when looking for manifests to preload.
Default is 50. Increase for repositories with many nested groups.
"""
...
@property
Expand Down Expand Up @@ -638,6 +642,28 @@ class ManifestPreloadConfig:
The condition under which manifests will be preloaded.
"""
...
@property
def max_arrays_to_scan(self) -> int | None:
"""
The maximum number of arrays to scan when looking for manifests to preload.

Returns
-------
int | None
The configured maximum number of arrays to scan, or None if it has not
been set, in which case the default of 50 is used.
"""
...
@max_arrays_to_scan.setter
def max_arrays_to_scan(self, value: int | None) -> None:
"""
Set the maximum number of arrays to scan when looking for manifests to preload.

Parameters
----------
value: int | None
The maximum number of arrays to scan. Passing None leaves the value
unset, in which case the default of 50 is used.
"""
...

class ManifestSplitCondition:
"""Configuration for conditions under which manifests will be split into splits"""
Expand Down
9 changes: 7 additions & 2 deletions icechunk-python/src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1100,17 +1100,20 @@ pub struct PyManifestPreloadConfig {
pub max_total_refs: Option<u32>,
#[pyo3(get, set)]
pub preload_if: Option<Py<PyManifestPreloadCondition>>,
#[pyo3(get, set)]
pub max_arrays_to_scan: Option<u32>,
}

#[pymethods]
impl PyManifestPreloadConfig {
#[new]
#[pyo3(signature = (max_total_refs=None, preload_if=None))]
#[pyo3(signature = (max_total_refs=None, preload_if=None, max_arrays_to_scan=None))]
fn new(
max_total_refs: Option<u32>,
preload_if: Option<Py<PyManifestPreloadCondition>>,
max_arrays_to_scan: Option<u32>,
) -> Self {
Self { max_total_refs, preload_if }
Self { max_total_refs, preload_if, max_arrays_to_scan }
}
}

Expand All @@ -1127,6 +1130,7 @@ impl From<&PyManifestPreloadConfig> for ManifestPreloadConfig {
Python::attach(|py| Self {
max_total_refs: value.max_total_refs,
preload_if: value.preload_if.as_ref().map(|c| (&*c.borrow(py)).into()),
max_arrays_to_scan: value.max_arrays_to_scan,
})
}
}
Expand All @@ -1140,6 +1144,7 @@ impl From<ManifestPreloadConfig> for PyManifestPreloadConfig {
Py::new(py, Into::<PyManifestPreloadCondition>::into(c))
.expect("Cannot create instance of ManifestPreloadCondition")
}),
max_arrays_to_scan: value.max_arrays_to_scan,
})
}
}
Expand Down
9 changes: 8 additions & 1 deletion icechunk/src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -264,20 +264,26 @@ pub enum ManifestPreloadCondition {
pub struct ManifestPreloadConfig {
/// Maximum total chunk references to preload across all manifests.
/// When `None`, `max_total_refs()` falls back to 10,000.
pub max_total_refs: Option<u32>,
/// Condition that determines which manifests should be preloaded.
/// When `None`, `preload_if()` falls back to a shared default condition.
pub preload_if: Option<ManifestPreloadCondition>,
/// Maximum number of arrays to scan when looking for manifests to preload.
/// When `None`, `max_arrays_to_scan()` falls back to 50.
pub max_arrays_to_scan: Option<u32>,
}

impl ManifestPreloadConfig {
/// Combines two preload configurations, giving precedence to `other`.
///
/// For every field, the value from `other` is used when it is `Some`;
/// otherwise the value already present in `self` is kept.
pub fn merge(&self, other: Self) -> Self {
    Self {
        max_total_refs: other.max_total_refs.or(self.max_total_refs),
        // Clone our own condition lazily: it is only needed when `other`
        // does not provide one.
        preload_if: other.preload_if.or_else(|| self.preload_if.clone()),
        max_arrays_to_scan: other.max_arrays_to_scan.or(self.max_arrays_to_scan),
    }
}

/// Returns the configured limit on preloaded chunk references,
/// or 10,000 when no limit has been set.
pub fn max_total_refs(&self) -> u32 {
    const DEFAULT_MAX_TOTAL_REFS: u32 = 10_000;
    self.max_total_refs.unwrap_or(DEFAULT_MAX_TOTAL_REFS)
}

/// Returns the configured maximum number of arrays to scan when looking
/// for manifests to preload, or 50 when none has been set.
pub fn max_arrays_to_scan(&self) -> u32 {
    const DEFAULT_MAX_ARRAYS_TO_SCAN: u32 = 50;
    self.max_arrays_to_scan.unwrap_or(DEFAULT_MAX_ARRAYS_TO_SCAN)
}

pub fn preload_if(&self) -> &ManifestPreloadCondition {
self.preload_if.as_ref().unwrap_or_else(|| {
DEFAULT_MANIFEST_PRELOAD_CONDITION.get_or_init(|| {
Expand Down Expand Up @@ -355,12 +361,13 @@ impl ManifestConfig {
.get_or_init(ManifestSplittingConfig::default)
})
}
// for testing only, create a config with no preloading and no splitting
// for testing only, create a config with no preloading, no splitting, and no max_arrays to scan
pub fn empty() -> Self {
ManifestConfig {
preload: Some(ManifestPreloadConfig {
max_total_refs: None,
preload_if: None,
max_arrays_to_scan: None,
}),
splitting: None,
}
Expand Down
7 changes: 5 additions & 2 deletions icechunk/src/repository.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1534,6 +1534,7 @@ impl Repository {
debug!("Preloading manifests");
let asset_manager = Arc::clone(self.asset_manager());
let preload_config = self.config().manifest().preload().clone();
let max_arrays_to_scan = preload_config.max_arrays_to_scan() as usize;
if preload_config.max_total_refs() == 0
|| matches!(preload_config.preload_if(), ManifestPreloadCondition::False)
{
Expand All @@ -1549,8 +1550,7 @@ impl Repository {
for node in snap
.iter_arc(&Path::root())
.filter_ok(|node| node.node_type() == NodeType::Array)
// TODO: make configurable
.take(50)
.take(max_arrays_to_scan)
{
match node {
Err(err) => {
Expand Down Expand Up @@ -1937,6 +1937,7 @@ mod tests {
preload: Some(ManifestPreloadConfig {
max_total_refs: None,
preload_if: None,
max_arrays_to_scan: None,
}),
splitting: Some(split_config.clone()),
};
Expand All @@ -1962,6 +1963,7 @@ mod tests {
preload: Some(ManifestPreloadConfig {
max_total_refs: None,
preload_if: None,
max_arrays_to_scan: None,
}),
splitting: Some(split_config.clone()),
};
Expand Down Expand Up @@ -3225,6 +3227,7 @@ mod tests {
preload: Some(ManifestPreloadConfig {
max_total_refs: Some(2),
preload_if: None,
max_arrays_to_scan: None,
}),
..ManifestConfig::default()
};
Expand Down
5 changes: 3 additions & 2 deletions icechunk/src/strategies.rs
Original file line number Diff line number Diff line change
Expand Up @@ -281,9 +281,10 @@ pub fn manifest_split_condition() -> BoxedStrategy<ManifestSplitCondition> {
prop_compose! {
pub fn manifest_preload_config()
(max_total_refs in option::of(any::<u32>()),
preload_if in option::of(manifest_preload_condition())
preload_if in option::of(manifest_preload_condition()),
max_arrays_to_scan in option::of(any::<u32>())
) -> ManifestPreloadConfig {
ManifestPreloadConfig { max_total_refs, preload_if }
ManifestPreloadConfig { max_total_refs, preload_if, max_arrays_to_scan}
}
}

Expand Down
Loading