Skip to content

Commit

Permalink
Implement manifest preloading
Browse files Browse the repository at this point in the history
This creates a pretty advanced mechanism to configure what manifest
should be preloaded on session creation.

It can match arrays by name, path or size; and it can create complex
conditions using `or` and `and` combinators.

By default, we preload manifests that have not more than 1k refs,
up to a total of 10k refs, and corresponding to arrays with names
matching any of the following regexes:


* `\bt\b|(time|min|hour|day|week|month|year)[0-9]*`
* `(z|nav_lev|gdep|lv_|[o]*lev|bottom_top|sigma|h(ei)?ght|altitude|depth|isobaric|pres|isotherm)[a-z_]*[0-9]*`
* `y|j|nlat|rlat|nj`
* `y?(nav_lat|lat|gphi)[a-z0-9]*`
* `x?(nav_lon|lon|glam)[a-z0-9]*`
* `x|i|nlon|rlon|ni`


The regex were taken from:

https://github.com/xarray-contrib/cf-xarray/blob/1591ff5ea7664a6bdef24055ef75e242cd5bfc8b/cf_xarray/criteria.py#L149-L160
  • Loading branch information
paraseba committed Jan 30, 2025
1 parent eeb8f7f commit 1a5f17e
Show file tree
Hide file tree
Showing 6 changed files with 400 additions and 12 deletions.
37 changes: 35 additions & 2 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions icechunk-python/src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -741,6 +741,8 @@ impl From<&PyRepositoryConfig> for RepositoryConfig {
virtual_chunk_containers: value.virtual_chunk_containers.as_ref().map(|c| {
c.iter().map(|(name, cont)| (name.clone(), cont.into())).collect()
}),
// FIXME: implement manifest preloading configuration in python
manifest: None,
})
}
}
Expand Down
1 change: 1 addition & 0 deletions icechunk/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ typetag = "0.2.19"
zstd = "0.13.2"
tokio-util = { version = "0.7.13", features = ["compat", "io-util"] }
serde_bytes = "0.11.15"
regex = "1.11.1"

[dev-dependencies]
pretty_assertions = "1.4.1"
Expand Down
109 changes: 109 additions & 0 deletions icechunk/src/config.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
use core::fmt;
use std::{
collections::HashMap,
ops::Bound,
path::PathBuf,
sync::{Arc, OnceLock},
};
Expand Down Expand Up @@ -103,6 +104,99 @@ impl CachingConfig {
}
}

#[derive(Debug, PartialEq, Eq, Serialize, Deserialize, Clone)]
pub enum ManifestPreloadCondition {
Or(Vec<ManifestPreloadCondition>),
And(Vec<ManifestPreloadCondition>),
PathMatches { regex: String },
NameMatches { regex: String },
NumRefs { from: Bound<u32>, to: Bound<u32> },
True,
False,
}

#[derive(Debug, PartialEq, Eq, Serialize, Deserialize, Clone, Default)]
pub struct ManifestPreloadConfig {
pub max_total_refs: Option<u32>,
pub preload_if: Option<ManifestPreloadCondition>,
}

impl ManifestPreloadConfig {
pub fn merge(&self, other: Self) -> Self {
Self {
max_total_refs: other.max_total_refs.or(self.max_total_refs),
preload_if: other.preload_if.or(self.preload_if.clone()),
}
}

pub fn max_total_refs(&self) -> u32 {
self.max_total_refs.unwrap_or(10_000)
}

pub fn preload_if(&self) -> &ManifestPreloadCondition {
self.preload_if.as_ref().unwrap_or_else(|| {
DEFAULT_MANIFEST_PRELOAD_CONDITION.get_or_init(|| {
ManifestPreloadCondition::And(vec![
ManifestPreloadCondition::Or(vec![
// regexes taken from https://github.com/xarray-contrib/cf-xarray/blob/1591ff5ea7664a6bdef24055ef75e242cd5bfc8b/cf_xarray/criteria.py#L149-L160
ManifestPreloadCondition::NameMatches {
// time
regex: r#"\bt\b|(time|min|hour|day|week|month|year)[0-9]*"#.to_string(), // codespell:ignore
},
ManifestPreloadCondition::NameMatches {
// Z
regex: r#"(z|nav_lev|gdep|lv_|[o]*lev|bottom_top|sigma|h(ei)?ght|altitude|depth|isobaric|pres|isotherm)[a-z_]*[0-9]*"#.to_string(), // codespell:ignore

},
ManifestPreloadCondition::NameMatches {
// Y
regex: r#"y|j|nlat|rlat|nj"#.to_string(), // codespell:ignore
},
ManifestPreloadCondition::NameMatches {
// latitude
regex: r#"y?(nav_lat|lat|gphi)[a-z0-9]*"#.to_string(), // codespell:ignore
},
ManifestPreloadCondition::NameMatches {
// longitude
regex: r#"x?(nav_lon|lon|glam)[a-z0-9]*"#.to_string(), // codespell:ignore
},
ManifestPreloadCondition::NameMatches {
// X
regex: r#"x|i|nlon|rlon|ni"#.to_string(), // codespell:ignore
},
]),
ManifestPreloadCondition::NumRefs {
from: Bound::Unbounded,
to: Bound::Included(1000),
},
])
})
})
}
}

static DEFAULT_MANIFEST_PRELOAD_CONDITION: OnceLock<ManifestPreloadCondition> =
OnceLock::new();

#[derive(Debug, PartialEq, Eq, Serialize, Deserialize, Clone, Default)]
pub struct ManifestConfig {
pub preload: Option<ManifestPreloadConfig>,
}

static DEFAULT_MANIFEST_PRELOAD_CONFIG: OnceLock<ManifestPreloadConfig> = OnceLock::new();

impl ManifestConfig {
pub fn merge(&self, other: Self) -> Self {
Self { preload: other.preload.or(self.preload.clone()) }
}

pub fn preload(&self) -> &ManifestPreloadConfig {
self.preload.as_ref().unwrap_or_else(|| {
DEFAULT_MANIFEST_PRELOAD_CONFIG.get_or_init(ManifestPreloadConfig::default)
})
}
}

#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq, Default)]
pub struct RepositoryConfig {
/// Chunks smaller than this will be stored inline in the manifst
Expand All @@ -123,13 +217,16 @@ pub struct RepositoryConfig {
pub storage: Option<storage::Settings>,

pub virtual_chunk_containers: Option<HashMap<ContainerName, VirtualChunkContainer>>,

pub manifest: Option<ManifestConfig>,
}

static DEFAULT_COMPRESSION: OnceLock<CompressionConfig> = OnceLock::new();
static DEFAULT_CACHING: OnceLock<CachingConfig> = OnceLock::new();
static DEFAULT_VIRTUAL_CHUNK_CONTAINERS: OnceLock<
HashMap<ContainerName, VirtualChunkContainer>,
> = OnceLock::new();
static DEFAULT_MANIFEST_CONFIG: OnceLock<ManifestConfig> = OnceLock::new();

impl RepositoryConfig {
pub fn inline_chunk_threshold_bytes(&self) -> u16 {
Expand All @@ -156,6 +253,12 @@ impl RepositoryConfig {
self.storage.as_ref()
}

pub fn manifest(&self) -> &ManifestConfig {
self.manifest.as_ref().unwrap_or_else(|| {
DEFAULT_MANIFEST_CONFIG.get_or_init(ManifestConfig::default)
})
}

pub fn merge(&self, other: Self) -> Self {
Self {
inline_chunk_threshold_bytes: other
Expand Down Expand Up @@ -198,6 +301,12 @@ impl RepositoryConfig {
Some(merged)
}
},
manifest: match (&self.manifest, other.manifest) {
(None, None) => None,
(None, Some(c)) => Some(c),
(Some(c), None) => Some(c.clone()),
(Some(mine), Some(theirs)) => Some(mine.merge(theirs)),
},
}
}
}
Expand Down
4 changes: 4 additions & 0 deletions icechunk/src/format/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -372,6 +372,10 @@ impl Path {
pub fn ancestors(&self) -> impl Iterator<Item = Path> + '_ {
self.0.ancestors().map(|p| Path(p.to_owned()))
}

pub fn name(&self) -> Option<&str> {
self.0.file_name()
}
}

impl TryFrom<&str> for Path {
Expand Down
Loading

0 comments on commit 1a5f17e

Please sign in to comment.