Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions icechunk-python/python/icechunk/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
GCSummary,
IcechunkError,
ManifestConfig,
ManifestFileInfo,
ManifestPreloadCondition,
ManifestPreloadConfig,
ObjectStoreConfig,
Expand Down Expand Up @@ -104,6 +105,7 @@
"IcechunkError",
"IcechunkStore",
"ManifestConfig",
"ManifestFileInfo",
"ManifestPreloadCondition",
"ManifestPreloadConfig",
"ObjectStoreConfig",
Expand Down
22 changes: 22 additions & 0 deletions icechunk-python/python/icechunk/_icechunk_python.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -1097,6 +1097,22 @@ class PyAsyncStringGenerator(AsyncGenerator[str, None], metaclass=abc.ABCMeta):
def __aiter__(self) -> PyAsyncStringGenerator: ...
async def __anext__(self) -> str: ...

class ManifestFileInfo:
"""Manifest file metadata"""

@property
def id(self) -> str:
"""The manifest id"""
...
@property
def size_bytes(self) -> int:
"""The size in bytes of the"""
...
@property
def num_chunk_refs(self) -> int:
"""The number of chunk references contained in this manifest"""
...

class SnapshotInfo:
"""Metadata for a snapshot"""
@property
Expand Down Expand Up @@ -1125,6 +1141,12 @@ class SnapshotInfo:
The metadata of the snapshot
"""
...
@property
def manifests(self) -> list[ManifestFileInfo]:
"""
The manifests linked to this snapshot
"""
...

class PyAsyncSnapshotGenerator(AsyncGenerator[SnapshotInfo, None], metaclass=abc.ABCMeta):
def __aiter__(self) -> PyAsyncSnapshotGenerator: ...
Expand Down
3 changes: 2 additions & 1 deletion icechunk-python/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ use errors::{
use icechunk::{format::format_constants::SpecVersionBin, initialize_tracing};
use pyo3::prelude::*;
use pyo3::wrap_pyfunction;
use repository::{PyDiff, PyGCSummary, PyRepository, PySnapshotInfo};
use repository::{PyDiff, PyGCSummary, PyManifestFileInfo, PyRepository, PySnapshotInfo};
use session::PySession;
use store::{PyStore, VirtualChunkSpec};

Expand Down Expand Up @@ -93,6 +93,7 @@ fn _icechunk_python(py: Python<'_>, m: &Bound<'_, PyModule>) -> PyResult<()> {
m.add_class::<PySession>()?;
m.add_class::<PyStore>()?;
m.add_class::<PySnapshotInfo>()?;
m.add_class::<PyManifestFileInfo>()?;
m.add_class::<PyConflictSolver>()?;
m.add_class::<PyBasicConflictSolver>()?;
m.add_class::<PyConflictDetector>()?;
Expand Down
37 changes: 35 additions & 2 deletions icechunk-python/src/repository.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ use icechunk::{
config::Credentials,
format::{
SnapshotId,
snapshot::{SnapshotInfo, SnapshotProperties},
snapshot::{ManifestFileInfo, SnapshotInfo, SnapshotProperties},
transaction_log::Diff,
},
ops::{
Expand Down Expand Up @@ -61,6 +61,16 @@ pub struct PySnapshotInfo {
message: String,
#[pyo3(get)]
metadata: PySnapshotProperties,
#[pyo3(get)]
manifests: Vec<PyManifestFileInfo>,
}

#[pyclass(name = "ManifestFileInfo", eq)]
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct PyManifestFileInfo {
pub id: String,
pub size_bytes: u64,
pub num_chunk_refs: u32,
}

impl<'py> FromPyObject<'py> for PySnapshotProperties {
Expand Down Expand Up @@ -161,18 +171,41 @@ impl From<PySnapshotProperties> for SnapshotProperties {
}
}

impl From<ManifestFileInfo> for PyManifestFileInfo {
fn from(val: ManifestFileInfo) -> Self {
Self {
id: val.id.to_string(),
size_bytes: val.size_bytes,
num_chunk_refs: val.num_chunk_refs,
}
}
}

impl From<SnapshotInfo> for PySnapshotInfo {
fn from(val: SnapshotInfo) -> Self {
PySnapshotInfo {
Self {
id: val.id.to_string(),
parent_id: val.parent_id.map(|id| id.to_string()),
written_at: val.flushed_at,
message: val.message,
metadata: val.metadata.into(),
manifests: val.manifests.into_iter().map(|v| v.into()).collect(),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Isn't this potentially a really big list? that will now be listed our with ancestry?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe we should have SnapshotInfo.list_manifests() instead?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes, but, ancestry is very slow anyway, since it fetches each snapshot from object store. Gathering the few manifests cannot add much

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you think the list of manifests could get very long with split? Don't forget this is only done when you try to print a snapshot, not something you would do a lot, or would expect to be fast.

Copy link
Collaborator Author

@paraseba paraseba Apr 3, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

scratch, my latest about "print". What I should have said is: these objects get populated only by ancestry, which is very slow, and they are mostly used and discarded, I don't imagine gathering a few hundred extra objects will become a problem

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah I understand now. My goal with this was trying to give us a bit more insight to debug, using the lookup_snapshot method. I like the idea of having a dedicated ancestry representation, that could be text in the console, html in a notebook, etc

Copy link
Collaborator Author

@paraseba paraseba Apr 3, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

An updating weather forecast repo might have O(100) manifests in each snapshot, one per array, for example.

But we are not fetching those manifests, only surfacing the data that is already in the snapshot, maybe 30 bytes per manifest or so.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But this definitely has an impact on the repr for snapshots. Do you want me to skip manifests there? we can do that

Copy link
Contributor

@dcherian dcherian Apr 3, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I'm only concerned about the repr, because we use it for ancestry right now.

Yes, let's just comment it out from the repr for now.

This will help for my update to test_can_read_old.py

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done in f9baf05

}
}
}

#[pymethods]
impl PyManifestFileInfo {
pub fn __repr__(&self) -> String {
format!(
r#"ManifestFileInfo(id="{id}", size_bytes={size}, num_chunk_refs={chunks})"#,
id = self.id,
size = self.size_bytes,
chunks = self.num_chunk_refs,
)
}
}

#[pymethods]
impl PySnapshotInfo {
pub fn __repr__(&self) -> String {
Expand Down
1 change: 1 addition & 0 deletions icechunk-python/tests/test_timetravel.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,7 @@ def test_timetravel() -> None:
"commit 1",
"Repository initialized",
]
assert [len(snap.manifests) for snap in parents] == [1, 1, 1, 0]
assert sorted(parents, key=lambda p: p.written_at) == list(reversed(parents))
assert len(set([snap.id for snap in parents])) == 4
assert list(repo.ancestry(tag="v1.0")) == parents
Expand Down
11 changes: 10 additions & 1 deletion icechunk/src/format/manifest.rs
Original file line number Diff line number Diff line change
Expand Up @@ -155,11 +155,20 @@ pub struct ChunkInfo {
pub payload: ChunkPayload,
}

#[derive(Debug)]
#[derive(PartialEq)]
pub struct Manifest {
buffer: Vec<u8>,
}

impl std::fmt::Debug for Manifest {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("Manifest")
.field("id", &self.id())
.field("chunks", &self.len())
.finish_non_exhaustive()
}
}

impl Manifest {
pub fn id(&self) -> ManifestId {
ManifestId::new(self.root().id().0)
Expand Down
3 changes: 1 addition & 2 deletions icechunk/src/format/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@ use ::flatbuffers::InvalidFlatbuffer;
use bytes::Bytes;
use flatbuffers::generated;
use format_constants::FileTypeBin;
use itertools::Itertools;
use manifest::{VirtualReferenceError, VirtualReferenceErrorKind};
use rand::{Rng, rng};
use serde::{Deserialize, Serialize};
Expand Down Expand Up @@ -108,7 +107,7 @@ impl<const SIZE: usize, T: FileTypeTag> ObjectId<SIZE, T> {

impl<const SIZE: usize, T: FileTypeTag> fmt::Debug for ObjectId<SIZE, T> {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "{:02x}", self.0.iter().format(""))
write!(f, "{}", String::from(self))
}
}

Expand Down
24 changes: 19 additions & 5 deletions icechunk/src/format/snapshot.rs
Original file line number Diff line number Diff line change
Expand Up @@ -251,18 +251,35 @@ impl ManifestFileInfo {
}
}

#[derive(Debug, PartialEq)]
#[derive(PartialEq)]
pub struct Snapshot {
buffer: Vec<u8>,
}

impl std::fmt::Debug for Snapshot {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
let nodes =
self.iter().map(|n| n.map(|n| n.path.to_string())).collect::<Vec<_>>();
f.debug_struct("Snapshot")
.field("id", &self.id())
.field("parent_id", &self.parent_id())
.field("flushed_at", &self.flushed_at())
.field("nodes", &nodes)
.field("manifests", &self.manifest_files().collect::<Vec<_>>())
.field("message", &self.message())
.field("metadata", &self.metadata())
.finish_non_exhaustive()
}
}

#[derive(Debug, Clone, PartialEq, Eq)]
pub struct SnapshotInfo {
pub id: SnapshotId,
pub parent_id: Option<SnapshotId>,
pub flushed_at: DateTime<chrono::Utc>,
pub message: String,
pub metadata: SnapshotProperties,
pub manifests: Vec<ManifestFileInfo>,
}

impl TryFrom<&Snapshot> for SnapshotInfo {
Expand All @@ -275,6 +292,7 @@ impl TryFrom<&Snapshot> for SnapshotInfo {
flushed_at: value.flushed_at()?,
message: value.message().to_string(),
metadata: value.metadata()?.clone(),
manifests: value.manifest_files().collect(),
})
}
}
Expand Down Expand Up @@ -432,10 +450,6 @@ impl Snapshot {
self.root().message().to_string()
}

// pub fn nodes(&self) -> &BTreeMap<Path, NodeSnapshot> {
// &self.nodes
// }

pub fn get_manifest_file(&self, id: &ManifestId) -> Option<ManifestFileInfo> {
self.root().manifest_files().iter().find(|mf| mf.id().0 == id.0.as_slice()).map(
|mf| ManifestFileInfo {
Expand Down