diff --git a/icechunk-python/python/icechunk/__init__.py b/icechunk-python/python/icechunk/__init__.py index 6c1359f07..3dab0b8f0 100644 --- a/icechunk-python/python/icechunk/__init__.py +++ b/icechunk-python/python/icechunk/__init__.py @@ -20,6 +20,7 @@ GCSummary, IcechunkError, ManifestConfig, + ManifestFileInfo, ManifestPreloadCondition, ManifestPreloadConfig, ObjectStoreConfig, @@ -104,6 +105,7 @@ "IcechunkError", "IcechunkStore", "ManifestConfig", + "ManifestFileInfo", "ManifestPreloadCondition", "ManifestPreloadConfig", "ObjectStoreConfig", diff --git a/icechunk-python/python/icechunk/_icechunk_python.pyi b/icechunk-python/python/icechunk/_icechunk_python.pyi index 174e9c974..78aca5609 100644 --- a/icechunk-python/python/icechunk/_icechunk_python.pyi +++ b/icechunk-python/python/icechunk/_icechunk_python.pyi @@ -1097,6 +1097,22 @@ class PyAsyncStringGenerator(AsyncGenerator[str, None], metaclass=abc.ABCMeta): def __aiter__(self) -> PyAsyncStringGenerator: ... async def __anext__(self) -> str: ... +class ManifestFileInfo: + """Manifest file metadata""" + + @property + def id(self) -> str: + """The manifest id""" + ... + @property + def size_bytes(self) -> int: + """The size in bytes of the manifest file""" + ... + @property + def num_chunk_refs(self) -> int: + """The number of chunk references contained in this manifest""" + ... + class SnapshotInfo: """Metadata for a snapshot""" @property @@ -1125,6 +1141,12 @@ class SnapshotInfo: The metadata of the snapshot """ ... + @property + def manifests(self) -> list[ManifestFileInfo]: + """ + The manifests linked to this snapshot + """ + ... class PyAsyncSnapshotGenerator(AsyncGenerator[SnapshotInfo, None], metaclass=abc.ABCMeta): def __aiter__(self) -> PyAsyncSnapshotGenerator: ... 
diff --git a/icechunk-python/src/lib.rs b/icechunk-python/src/lib.rs index 02d9c394f..7072c29a5 100644 --- a/icechunk-python/src/lib.rs +++ b/icechunk-python/src/lib.rs @@ -28,7 +28,7 @@ use errors::{ use icechunk::{format::format_constants::SpecVersionBin, initialize_tracing}; use pyo3::prelude::*; use pyo3::wrap_pyfunction; -use repository::{PyDiff, PyGCSummary, PyRepository, PySnapshotInfo}; +use repository::{PyDiff, PyGCSummary, PyManifestFileInfo, PyRepository, PySnapshotInfo}; use session::PySession; use store::{PyStore, VirtualChunkSpec}; @@ -93,6 +93,7 @@ fn _icechunk_python(py: Python<'_>, m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_class::()?; m.add_class::()?; m.add_class::()?; + m.add_class::()?; m.add_class::()?; m.add_class::()?; m.add_class::()?; diff --git a/icechunk-python/src/repository.rs b/icechunk-python/src/repository.rs index 665ee1264..9fad02de0 100644 --- a/icechunk-python/src/repository.rs +++ b/icechunk-python/src/repository.rs @@ -13,7 +13,7 @@ use icechunk::{ config::Credentials, format::{ SnapshotId, - snapshot::{SnapshotInfo, SnapshotProperties}, + snapshot::{ManifestFileInfo, SnapshotInfo, SnapshotProperties}, transaction_log::Diff, }, ops::{ @@ -61,6 +61,16 @@ pub struct PySnapshotInfo { message: String, #[pyo3(get)] metadata: PySnapshotProperties, + #[pyo3(get)] + manifests: Vec, +} + +#[pyclass(name = "ManifestFileInfo", eq)] +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct PyManifestFileInfo { + pub id: String, + pub size_bytes: u64, + pub num_chunk_refs: u32, } impl<'py> FromPyObject<'py> for PySnapshotProperties { @@ -161,18 +171,41 @@ impl From for SnapshotProperties { } } +impl From for PyManifestFileInfo { + fn from(val: ManifestFileInfo) -> Self { + Self { + id: val.id.to_string(), + size_bytes: val.size_bytes, + num_chunk_refs: val.num_chunk_refs, + } + } +} + impl From for PySnapshotInfo { fn from(val: SnapshotInfo) -> Self { - PySnapshotInfo { + Self { id: val.id.to_string(), parent_id: val.parent_id.map(|id| 
id.to_string()), written_at: val.flushed_at, message: val.message, metadata: val.metadata.into(), + manifests: val.manifests.into_iter().map(|v| v.into()).collect(), } } } +#[pymethods] +impl PyManifestFileInfo { + pub fn __repr__(&self) -> String { + format!( + r#"ManifestFileInfo(id="{id}", size_bytes={size}, num_chunk_refs={chunks})"#, + id = self.id, + size = self.size_bytes, + chunks = self.num_chunk_refs, + ) + } +} + #[pymethods] impl PySnapshotInfo { pub fn __repr__(&self) -> String { diff --git a/icechunk-python/tests/test_timetravel.py b/icechunk-python/tests/test_timetravel.py index 7824ba14c..1948944e3 100644 --- a/icechunk-python/tests/test_timetravel.py +++ b/icechunk-python/tests/test_timetravel.py @@ -124,6 +124,7 @@ def test_timetravel() -> None: "commit 1", "Repository initialized", ] + assert [len(snap.manifests) for snap in parents] == [1, 1, 1, 0] assert sorted(parents, key=lambda p: p.written_at) == list(reversed(parents)) assert len(set([snap.id for snap in parents])) == 4 assert list(repo.ancestry(tag="v1.0")) == parents diff --git a/icechunk/src/format/manifest.rs b/icechunk/src/format/manifest.rs index c6cb71e56..e8af0d7a8 100644 --- a/icechunk/src/format/manifest.rs +++ b/icechunk/src/format/manifest.rs @@ -155,11 +155,20 @@ pub struct ChunkInfo { pub payload: ChunkPayload, } -#[derive(Debug)] +#[derive(PartialEq)] pub struct Manifest { buffer: Vec, } +impl std::fmt::Debug for Manifest { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("Manifest") + .field("id", &self.id()) + .field("chunks", &self.len()) + .finish_non_exhaustive() + } +} + impl Manifest { pub fn id(&self) -> ManifestId { ManifestId::new(self.root().id().0) diff --git a/icechunk/src/format/mod.rs b/icechunk/src/format/mod.rs index 451e8e8a1..1b96faf23 100644 --- a/icechunk/src/format/mod.rs +++ b/icechunk/src/format/mod.rs @@ -11,7 +11,6 @@ use ::flatbuffers::InvalidFlatbuffer; use bytes::Bytes; use flatbuffers::generated; use 
format_constants::FileTypeBin; -use itertools::Itertools; use manifest::{VirtualReferenceError, VirtualReferenceErrorKind}; use rand::{Rng, rng}; use serde::{Deserialize, Serialize}; @@ -108,7 +107,7 @@ impl ObjectId { impl fmt::Debug for ObjectId { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(f, "{:02x}", self.0.iter().format("")) + write!(f, "{}", String::from(self)) } } diff --git a/icechunk/src/format/snapshot.rs b/icechunk/src/format/snapshot.rs index 2992a7900..d9c4cc5a9 100644 --- a/icechunk/src/format/snapshot.rs +++ b/icechunk/src/format/snapshot.rs @@ -251,11 +251,27 @@ impl ManifestFileInfo { } } -#[derive(Debug, PartialEq)] +#[derive(PartialEq)] pub struct Snapshot { buffer: Vec, } +impl std::fmt::Debug for Snapshot { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let nodes = + self.iter().map(|n| n.map(|n| n.path.to_string())).collect::>(); + f.debug_struct("Snapshot") + .field("id", &self.id()) + .field("parent_id", &self.parent_id()) + .field("flushed_at", &self.flushed_at()) + .field("nodes", &nodes) + .field("manifests", &self.manifest_files().collect::>()) + .field("message", &self.message()) + .field("metadata", &self.metadata()) + .finish_non_exhaustive() + } +} + #[derive(Debug, Clone, PartialEq, Eq)] pub struct SnapshotInfo { pub id: SnapshotId, @@ -263,6 +279,7 @@ pub struct SnapshotInfo { pub flushed_at: DateTime, pub message: String, pub metadata: SnapshotProperties, + pub manifests: Vec, } impl TryFrom<&Snapshot> for SnapshotInfo { @@ -275,6 +292,7 @@ impl TryFrom<&Snapshot> for SnapshotInfo { flushed_at: value.flushed_at()?, message: value.message().to_string(), metadata: value.metadata()?.clone(), + manifests: value.manifest_files().collect(), }) } } @@ -432,10 +450,6 @@ impl Snapshot { self.root().message().to_string() } - // pub fn nodes(&self) -> &BTreeMap { - // &self.nodes - // } - pub fn get_manifest_file(&self, id: &ManifestId) -> Option { 
self.root().manifest_files().iter().find(|mf| mf.id().0 == id.0.as_slice()).map( |mf| ManifestFileInfo {