Skip to content

Commit d6fbf02

Browse files
authored
Uncache history (#635)
We had a cache in each snapshot with pointers to the snapshot's entire ancestry. This makes expiration harder, because we need to invalidate these caches. We are replacing the cache with a single `parent_id` link. In the future, we may optimize this for performance. Note: Python still turns the stream of parents into a Python list. This is now much worse, because the new ancestry implementation is much slower. We'll need to change this to use an AsyncGenerator.
1 parent 0ea64bf commit d6fbf02

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

58 files changed

+260
-194
lines changed

docs/docs/scripts/readthedocs.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,4 +21,4 @@ ${ config.versions.active.map(
2121
</div>`;
2222

2323
document.querySelector(".md-header__topic").insertAdjacentHTML("beforeend", versioning);
24-
});
24+
});

icechunk-python/python/icechunk/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
S3Credentials,
2323
S3Options,
2424
S3StaticCredentials,
25-
SnapshotMetadata,
25+
SnapshotInfo,
2626
Storage,
2727
StorageConcurrencySettings,
2828
StorageSettings,
@@ -98,7 +98,7 @@
9898
"S3Options",
9999
"S3StaticCredentials",
100100
"Session",
101-
"SnapshotMetadata",
101+
"SnapshotInfo",
102102
"Storage",
103103
"StorageConcurrencySettings",
104104
"StorageSettings",

icechunk-python/python/icechunk/_icechunk_python.pyi

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -219,7 +219,7 @@ class PyRepository:
219219
branch: str | None = None,
220220
tag: str | None = None,
221221
snapshot: str | None = None,
222-
) -> list[SnapshotMetadata]: ...
222+
) -> list[SnapshotInfo]: ...
223223
def create_branch(self, branch: str, snapshot_id: str) -> None: ...
224224
def list_branches(self) -> set[str]: ...
225225
def lookup_branch(self, branch: str) -> str: ...
@@ -311,13 +311,17 @@ class PyAsyncStringGenerator(AsyncGenerator[str, None], metaclass=abc.ABCMeta):
311311
def __aiter__(self) -> PyAsyncStringGenerator: ...
312312
async def __anext__(self) -> str: ...
313313

314-
class SnapshotMetadata:
314+
class SnapshotInfo:
315315
"""Metadata for a snapshot"""
316316
@property
317317
def id(self) -> str:
318318
"""The snapshot ID"""
319319
...
320320
@property
321+
def parent_id(self) -> str | None:
322+
"""The ID of the parent snapshot, if any"""
323+
...
324+
@property
321325
def written_at(self) -> datetime.datetime:
322326
"""
323327
The timestamp when the snapshot was written
@@ -330,11 +334,9 @@ class SnapshotMetadata:
330334
"""
331335
...
332336

333-
class PyAsyncSnapshotGenerator(
334-
AsyncGenerator[SnapshotMetadata, None], metaclass=abc.ABCMeta
335-
):
337+
class PyAsyncSnapshotGenerator(AsyncGenerator[SnapshotInfo, None], metaclass=abc.ABCMeta):
336338
def __aiter__(self) -> PyAsyncSnapshotGenerator: ...
337-
async def __anext__(self) -> SnapshotMetadata: ...
339+
async def __anext__(self) -> SnapshotInfo: ...
338340

339341
class S3StaticCredentials:
340342
access_key_id: str

icechunk-python/python/icechunk/repository.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
from icechunk._icechunk_python import (
44
PyRepository,
55
RepositoryConfig,
6-
SnapshotMetadata,
6+
SnapshotInfo,
77
Storage,
88
)
99
from icechunk.credentials import AnyCredential
@@ -181,7 +181,7 @@ def ancestry(
181181
branch: str | None = None,
182182
tag: str | None = None,
183183
snapshot: str | None = None,
184-
) -> list[SnapshotMetadata]:
184+
) -> list[SnapshotInfo]:
185185
"""
186186
Get the ancestry of a snapshot.
187187
@@ -196,7 +196,7 @@ def ancestry(
196196
197197
Returns
198198
-------
199-
list[SnapshotMetadata]
199+
list[SnapshotInfo]
200200
The ancestry of the snapshot, listing out the snapshots and their metadata.
201201
202202
Notes

icechunk-python/src/config.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -106,7 +106,7 @@ fn format_option<'a, T: AsRef<str> + 'a>(o: Option<T>) -> String {
106106
}
107107
}
108108

109-
fn format_option_string<'a, T: AsRef<str> + 'a>(o: Option<T>) -> String {
109+
pub(crate) fn format_option_string<'a, T: AsRef<str> + 'a>(o: Option<T>) -> String {
110110
match o.as_ref() {
111111
None => "None".to_string(),
112112
Some(s) => format!(r#""{}""#, s.as_ref()),

icechunk-python/src/lib.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ use errors::{
2222
PyRebaseFailedError,
2323
};
2424
use pyo3::prelude::*;
25-
use repository::{PyRepository, PySnapshotMetadata};
25+
use repository::{PyRepository, PySnapshotInfo};
2626
use session::PySession;
2727
use store::PyStore;
2828

@@ -34,7 +34,7 @@ fn _icechunk_python(py: Python<'_>, m: &Bound<'_, PyModule>) -> PyResult<()> {
3434
m.add_class::<PyRepositoryConfig>()?;
3535
m.add_class::<PySession>()?;
3636
m.add_class::<PyStore>()?;
37-
m.add_class::<PySnapshotMetadata>()?;
37+
m.add_class::<PySnapshotInfo>()?;
3838
m.add_class::<PyConflictSolver>()?;
3939
m.add_class::<PyBasicConflictSolver>()?;
4040
m.add_class::<PyConflictDetector>()?;

icechunk-python/src/repository.rs

Lines changed: 17 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ use chrono::{DateTime, Utc};
77
use futures::TryStreamExt;
88
use icechunk::{
99
config::Credentials,
10-
format::{snapshot::SnapshotMetadata, SnapshotId},
10+
format::{snapshot::SnapshotInfo, SnapshotId},
1111
repository::{RepositoryError, VersionInfo},
1212
Repository,
1313
};
@@ -16,40 +16,45 @@ use tokio::sync::RwLock;
1616

1717
use crate::{
1818
config::{
19-
datetime_repr, PyCredentials, PyRepositoryConfig, PyStorage, PyStorageSettings,
19+
datetime_repr, format_option_string, PyCredentials, PyRepositoryConfig,
20+
PyStorage, PyStorageSettings,
2021
},
2122
errors::PyIcechunkStoreError,
2223
session::PySession,
2324
};
2425

25-
#[pyclass(name = "SnapshotMetadata", eq)]
26+
#[pyclass(name = "SnapshotInfo", eq)]
2627
#[derive(Clone, Debug, PartialEq, Eq)]
27-
pub struct PySnapshotMetadata {
28+
pub struct PySnapshotInfo {
2829
#[pyo3(get)]
2930
id: String,
3031
#[pyo3(get)]
32+
parent_id: Option<String>,
33+
#[pyo3(get)]
3134
written_at: DateTime<Utc>,
3235
#[pyo3(get)]
3336
message: String,
3437
}
3538

36-
impl From<SnapshotMetadata> for PySnapshotMetadata {
37-
fn from(val: SnapshotMetadata) -> Self {
38-
PySnapshotMetadata {
39+
impl From<SnapshotInfo> for PySnapshotInfo {
40+
fn from(val: SnapshotInfo) -> Self {
41+
PySnapshotInfo {
3942
id: val.id.to_string(),
40-
written_at: val.written_at,
43+
parent_id: val.parent_id.map(|id| id.to_string()),
44+
written_at: val.flushed_at,
4145
message: val.message,
4246
}
4347
}
4448
}
4549

4650
#[pymethods]
47-
impl PySnapshotMetadata {
51+
impl PySnapshotInfo {
4852
pub fn __repr__(&self) -> String {
4953
// TODO: escape
5054
format!(
51-
r#"SnapshotMetadata(id="{id}",written_at={at},message="{message}")"#,
55+
r#"SnapshotInfo(id="{id}", parent_id={parent}, written_at={at}, message="{message}")"#,
5256
id = self.id,
57+
parent = format_option_string(self.parent_id.as_ref()),
5358
at = datetime_repr(&self.written_at),
5459
message = self.message.chars().take(10).collect::<String>() + "...",
5560
)
@@ -225,7 +230,7 @@ impl PyRepository {
225230
branch: Option<String>,
226231
tag: Option<String>,
227232
snapshot: Option<String>,
228-
) -> PyResult<Vec<PySnapshotMetadata>> {
233+
) -> PyResult<Vec<PySnapshotInfo>> {
229234
// This function calls block_on, so we need to allow other thread python to make progress
230235
py.allow_threads(move || {
231236
let version = args_to_version_info(branch, tag, snapshot)?;
@@ -237,7 +242,7 @@ impl PyRepository {
237242
.ancestry(&version)
238243
.await
239244
.map_err(PyIcechunkStoreError::RepositoryError)?
240-
.map_ok(Into::<PySnapshotMetadata>::into)
245+
.map_ok(Into::<PySnapshotInfo>::into)
241246
.try_collect::<Vec<_>>()
242247
.await
243248
.map_err(PyIcechunkStoreError::RepositoryError)?;

0 commit comments

Comments
 (0)