Skip to content

Commit

Permalink
Remove some cruft from the on disk datastuctures (#647)
Browse files Browse the repository at this point in the history
This is a backwards incompatible change
  • Loading branch information
paraseba authored Jan 30, 2025
1 parent fc603b3 commit eeb8f7f
Show file tree
Hide file tree
Showing 52 changed files with 48 additions and 99 deletions.
32 changes: 16 additions & 16 deletions icechunk-python/tests/data/test-repo/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,18 @@ compression: null
caching: null
storage: null
virtual_chunk_containers:
gcs:
name: gcs
url_prefix: gcs
store: !Gcs {}
s3:
name: s3
url_prefix: s3://
store: !S3Compatible
region: us-east-1
endpoint_url: http://localhost:9000
anonymous: false
allow_http: true
az:
name: az
url_prefix: az
store: !Azure {}
tigris:
name: tigris
url_prefix: tigris
Expand All @@ -17,19 +25,11 @@ virtual_chunk_containers:
endpoint_url: https://fly.storage.tigris.dev
anonymous: false
allow_http: false
az:
name: az
url_prefix: az
store: !Azure {}
gcs:
name: gcs
url_prefix: gcs
store: !Gcs {}
file:
name: file
url_prefix: file
store: !LocalFileSystem ''
s3:
name: s3
url_prefix: s3://
store: !S3Compatible
region: us-east-1
endpoint_url: http://localhost:9000
anonymous: false
allow_http: true
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"snapshot":"7WNXDEQFC5ATTVWGPNP0"}
{"snapshot":"FK0CX5JQH2DVDZ6PD6WG"}
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"snapshot":"HRRH2GHC3W746QRMH8M0"}
{"snapshot":"KCR7ES7JPCBY23X6MY3G"}
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"snapshot":"GKC3ZERQX8XJ59GRN2D0"}
{"snapshot":"QY5JG2BWG2VPPDJR4JE0"}
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"snapshot":"8540BKNK5108JA505M4G"}
{"snapshot":"VNPWJSZWB9G990XV1V8G"}
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"snapshot":"61WK4XSS6SWVWDM8JJ40"}
{"snapshot":"G0BR0G9NKT75ZZS7BWWG"}
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"snapshot":"H0RA0TB5YSFZ0ZKPB340"}
{"snapshot":"9W0W1DS2BKRV4MK2A2S0"}
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"snapshot":"7WNXDEQFC5ATTVWGPNP0"}
{"snapshot":"FK0CX5JQH2DVDZ6PD6WG"}
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"snapshot":"H0RA0TB5YSFZ0ZKPB340"}
{"snapshot":"9W0W1DS2BKRV4MK2A2S0"}
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"snapshot":"61WK4XSS6SWVWDM8JJ40"}
{"snapshot":"G0BR0G9NKT75ZZS7BWWG"}
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"snapshot":"H0RA0TB5YSFZ0ZKPB340"}
{"snapshot":"9W0W1DS2BKRV4MK2A2S0"}
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
2 changes: 1 addition & 1 deletion icechunk/src/asset_manager.rs
Original file line number Diff line number Diff line change
Expand Up @@ -559,7 +559,7 @@ async fn write_new_tx_log(

let buffer = tokio::task::spawn_blocking(move || {
let buffer = binary_file_header(
SpecVersionBin::V0_1_0Alpha12,
SpecVersionBin::current(),
FileTypeBin::TransactionLog,
CompressionAlgorithmBin::Zstd,
);
Expand Down
20 changes: 4 additions & 16 deletions icechunk/src/format/manifest.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@ use serde::{Deserialize, Serialize};
use crate::storage::ETag;

use super::{
format_constants::SpecVersionBin, ChunkId, ChunkIndices, ChunkLength, ChunkOffset,
IcechunkFormatError, IcechunkFormatVersion, IcechunkResult, ManifestId, NodeId,
ChunkId, ChunkIndices, ChunkLength, ChunkOffset, IcechunkFormatError, IcechunkResult,
ManifestId, NodeId,
};

type ManifestExtents = Range<ChunkIndices>;
Expand Down Expand Up @@ -124,20 +124,13 @@ pub struct ChunkInfo {

#[derive(Debug, PartialEq)]
pub struct Manifest {
pub icechunk_manifest_format_version: IcechunkFormatVersion,
pub icechunk_manifest_format_flags: BTreeMap<String, rmpv::Value>,
pub id: ManifestId,
pub(crate) chunks: BTreeMap<NodeId, BTreeMap<ChunkIndices, ChunkPayload>>,
}

impl Default for Manifest {
fn default() -> Self {
Self {
icechunk_manifest_format_version: Default::default(),
icechunk_manifest_format_flags: Default::default(),
id: ManifestId::random(),
chunks: Default::default(),
}
Self { id: ManifestId::random(), chunks: Default::default() }
}
}

Expand All @@ -160,12 +153,7 @@ impl Manifest {
}

pub fn new(chunks: BTreeMap<NodeId, BTreeMap<ChunkIndices, ChunkPayload>>) -> Self {
Self {
chunks,
icechunk_manifest_format_version: SpecVersionBin::current() as u8,
icechunk_manifest_format_flags: Default::default(),
id: ManifestId::random(),
}
Self { chunks, id: ManifestId::random() }
}

pub async fn from_stream<E>(
Expand Down
10 changes: 3 additions & 7 deletions icechunk/src/format/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -236,8 +236,6 @@ pub enum IcechunkFormatError {
InvalidCompressionAlgorithm, // TODO: add more info
}

pub type IcechunkFormatVersion = u8;

pub type IcechunkResult<T> = Result<T, IcechunkFormatError>;

pub mod format_constants {
Expand Down Expand Up @@ -273,25 +271,23 @@ pub mod format_constants {
#[repr(u8)]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SpecVersionBin {
V0_1_0Alpha12 = 1u8,
V0dot1 = 1u8,
}

impl TryFrom<u8> for SpecVersionBin {
type Error = String;

fn try_from(value: u8) -> Result<Self, Self::Error> {
match value {
n if n == SpecVersionBin::V0_1_0Alpha12 as u8 => {
Ok(SpecVersionBin::V0_1_0Alpha12)
}
n if n == SpecVersionBin::V0dot1 as u8 => Ok(SpecVersionBin::V0dot1),
n => Err(format!("Bad spec version code: {}", n)),
}
}
}

impl SpecVersionBin {
pub fn current() -> Self {
Self::V0_1_0Alpha12
Self::V0dot1
}
}

Expand Down
26 changes: 3 additions & 23 deletions icechunk/src/format/serializers/current.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ use crate::format::{
AttributeFileInfo, ManifestFileInfo, NodeSnapshot, Snapshot, SnapshotProperties,
},
transaction_log::TransactionLog,
ChunkIndices, IcechunkFormatVersion, ManifestId, NodeId, Path, SnapshotId,
ChunkIndices, ManifestId, NodeId, Path, SnapshotId,
};

#[derive(Debug, Deserialize)]
Expand Down Expand Up @@ -68,45 +68,30 @@ impl<'a> From<&'a Snapshot> for SnapshotSerializer<'a> {

#[derive(Debug, Deserialize)]
pub struct ManifestDeserializer {
icechunk_manifest_format_version: IcechunkFormatVersion,
icechunk_manifest_format_flags: BTreeMap<String, rmpv::Value>,
id: ManifestId,
chunks: BTreeMap<NodeId, BTreeMap<ChunkIndices, ChunkPayload>>,
}

#[derive(Debug, Serialize)]
pub struct ManifestSerializer<'a> {
icechunk_manifest_format_version: &'a IcechunkFormatVersion,
icechunk_manifest_format_flags: &'a BTreeMap<String, rmpv::Value>,
id: &'a ManifestId,
chunks: &'a BTreeMap<NodeId, BTreeMap<ChunkIndices, ChunkPayload>>,
}

impl From<ManifestDeserializer> for Manifest {
fn from(value: ManifestDeserializer) -> Self {
Self {
icechunk_manifest_format_version: value.icechunk_manifest_format_version,
icechunk_manifest_format_flags: value.icechunk_manifest_format_flags,
id: value.id,
chunks: value.chunks,
}
Self { id: value.id, chunks: value.chunks }
}
}

impl<'a> From<&'a Manifest> for ManifestSerializer<'a> {
fn from(value: &'a Manifest) -> Self {
Self {
icechunk_manifest_format_version: &value.icechunk_manifest_format_version,
icechunk_manifest_format_flags: &value.icechunk_manifest_format_flags,
id: &value.id,
chunks: &value.chunks,
}
Self { id: &value.id, chunks: &value.chunks }
}
}

#[derive(Debug, Deserialize)]
pub struct TransactionLogDeserializer {
icechunk_transaction_log_format_version: IcechunkFormatVersion,
new_groups: HashSet<NodeId>,
new_arrays: HashSet<NodeId>,
deleted_groups: HashSet<NodeId>,
Expand All @@ -118,7 +103,6 @@ pub struct TransactionLogDeserializer {

#[derive(Debug, Serialize)]
pub struct TransactionLogSerializer<'a> {
icechunk_transaction_log_format_version: &'a IcechunkFormatVersion,
new_groups: &'a HashSet<NodeId>,
new_arrays: &'a HashSet<NodeId>,
deleted_groups: &'a HashSet<NodeId>,
Expand All @@ -131,8 +115,6 @@ pub struct TransactionLogSerializer<'a> {
impl From<TransactionLogDeserializer> for TransactionLog {
fn from(value: TransactionLogDeserializer) -> Self {
Self {
icechunk_transaction_log_format_version: value
.icechunk_transaction_log_format_version,
new_groups: value.new_groups,
new_arrays: value.new_arrays,
deleted_groups: value.deleted_groups,
Expand All @@ -147,8 +129,6 @@ impl From<TransactionLogDeserializer> for TransactionLog {
impl<'a> From<&'a TransactionLog> for TransactionLogSerializer<'a> {
fn from(value: &'a TransactionLog) -> Self {
Self {
icechunk_transaction_log_format_version: &value
.icechunk_transaction_log_format_version,
new_groups: &value.new_groups,
new_arrays: &value.new_arrays,
deleted_groups: &value.deleted_groups,
Expand Down
12 changes: 6 additions & 6 deletions icechunk/src/format/serializers/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ pub fn serialize_snapshot(
write: &mut impl Write,
) -> Result<(), rmp_serde::encode::Error> {
match version {
SpecVersionBin::V0_1_0Alpha12 => {
SpecVersionBin::V0dot1 => {
let serializer = SnapshotSerializer::from(snapshot);
rmp_serde::encode::write(write, &serializer)
}
Expand All @@ -73,7 +73,7 @@ pub fn serialize_manifest(
write: &mut impl Write,
) -> Result<(), rmp_serde::encode::Error> {
match version {
SpecVersionBin::V0_1_0Alpha12 => {
SpecVersionBin::V0dot1 => {
let serializer = ManifestSerializer::from(manifest);
rmp_serde::encode::write(write, &serializer)
}
Expand All @@ -86,7 +86,7 @@ pub fn serialize_transaction_log(
write: &mut impl Write,
) -> Result<(), rmp_serde::encode::Error> {
match version {
SpecVersionBin::V0_1_0Alpha12 => {
SpecVersionBin::V0dot1 => {
let serializer = TransactionLogSerializer::from(transaction_log);
rmp_serde::encode::write(write, &serializer)
}
Expand All @@ -98,7 +98,7 @@ pub fn deserialize_snapshot(
read: Box<dyn Read>,
) -> Result<Snapshot, rmp_serde::decode::Error> {
match version {
SpecVersionBin::V0_1_0Alpha12 => {
SpecVersionBin::V0dot1 => {
let deserializer: SnapshotDeserializer = rmp_serde::from_read(read)?;
Ok(deserializer.into())
}
Expand All @@ -110,7 +110,7 @@ pub fn deserialize_manifest(
read: Box<dyn Read>,
) -> Result<Manifest, rmp_serde::decode::Error> {
match version {
SpecVersionBin::V0_1_0Alpha12 => {
SpecVersionBin::V0dot1 => {
let deserializer: ManifestDeserializer = rmp_serde::from_read(read)?;
Ok(deserializer.into())
}
Expand All @@ -122,7 +122,7 @@ pub fn deserialize_transaction_log(
read: Box<dyn Read>,
) -> Result<TransactionLog, rmp_serde::decode::Error> {
match version {
SpecVersionBin::V0_1_0Alpha12 => {
SpecVersionBin::V0dot1 => {
let deserializer: TransactionLogDeserializer = rmp_serde::from_read(read)?;
Ok(deserializer.into())
}
Expand Down
20 changes: 4 additions & 16 deletions icechunk/src/format/snapshot.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,9 @@ use crate::metadata::{
};

use super::{
format_constants::SpecVersionBin,
manifest::{Manifest, ManifestRef},
AttributesId, ChunkIndices, IcechunkFormatError, IcechunkFormatVersion,
IcechunkResult, ManifestId, NodeId, Path, SnapshotId, TableOffset,
AttributesId, ChunkIndices, IcechunkFormatError, IcechunkResult, ManifestId, NodeId,
Path, SnapshotId, TableOffset,
};

#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
Expand Down Expand Up @@ -126,26 +125,19 @@ pub type SnapshotProperties = HashMap<String, Value>;
#[derive(Debug, PartialEq, Serialize, Deserialize, Clone, Eq, Hash)]
pub struct ManifestFileInfo {
pub id: ManifestId,
pub format_version: IcechunkFormatVersion,
pub size_bytes: u64,
pub num_rows: u32,
}

impl ManifestFileInfo {
pub fn new(manifest: &Manifest, size_bytes: u64) -> Self {
Self {
id: manifest.id.clone(),
format_version: SpecVersionBin::current() as u8,
num_rows: manifest.len() as u32,
size_bytes,
}
Self { id: manifest.id.clone(), num_rows: manifest.len() as u32, size_bytes }
}
}

#[derive(Debug, PartialEq, Serialize, Deserialize, Clone)]
pub struct AttributeFileInfo {
pub id: AttributesId,
pub format_version: IcechunkFormatVersion,
}

#[derive(Debug, PartialEq)]
Expand Down Expand Up @@ -381,9 +373,7 @@ impl Iterator for NodeIterator {
#[cfg(test)]
#[allow(clippy::panic, clippy::unwrap_used, clippy::expect_used)]
mod tests {
use crate::format::{
format_constants::SpecVersionBin, IcechunkFormatError, ObjectId,
};
use crate::format::{IcechunkFormatError, ObjectId};

use super::*;
use pretty_assertions::assert_eq;
Expand Down Expand Up @@ -502,13 +492,11 @@ mod tests {
let manifests = vec![
ManifestFileInfo {
id: man_ref1.object_id.clone(),
format_version: SpecVersionBin::current() as u8,
size_bytes: 1_000_000,
num_rows: 100_000,
},
ManifestFileInfo {
id: man_ref2.object_id.clone(),
format_version: SpecVersionBin::current() as u8,
size_bytes: 1_000_000,
num_rows: 100_000,
},
Expand Down
Loading

0 comments on commit eeb8f7f

Please sign in to comment.