diff --git a/blobby/Cargo.toml b/blobby/Cargo.toml
index 8df2f91c..1a612c45 100644
--- a/blobby/Cargo.toml
+++ b/blobby/Cargo.toml
@@ -10,3 +10,6 @@ categories = ["no-std"]
 edition = "2024"
 rust-version = "1.85"
 readme = "README.md"
+
+[features]
+alloc = []
diff --git a/blobby/README.md b/blobby/README.md
index b6efb3b1..bbe8e07a 100644
--- a/blobby/README.md
+++ b/blobby/README.md
@@ -11,29 +11,47 @@ Iterators over a simple binary blob storage.
 ## Examples
 
 ```
-let buf = b"\x02\x05hello\x06world!\x01\x02 \x00\x03\x06:::\x03\x01\x00";
-let mut v = blobby::BlobIterator::new(buf).unwrap();
-assert_eq!(v.next(), Some(Ok(&b"hello"[..])));
-assert_eq!(v.next(), Some(Ok(&b" "[..])));
-assert_eq!(v.next(), Some(Ok(&b""[..])));
-assert_eq!(v.next(), Some(Ok(&b"world!"[..])));
-assert_eq!(v.next(), Some(Ok(&b":::"[..])));
-assert_eq!(v.next(), Some(Ok(&b"world!"[..])));
-assert_eq!(v.next(), Some(Ok(&b"hello"[..])));
-assert_eq!(v.next(), Some(Ok(&b""[..])));
-assert_eq!(v.next(), None);
-
-let mut v = blobby::Blob2Iterator::new(buf).unwrap();
-assert_eq!(v.next(), Some(Ok([&b"hello"[..], b" "])));
-assert_eq!(v.next(), Some(Ok([&b""[..], b"world!"])));
-assert_eq!(v.next(), Some(Ok([&b":::"[..], b"world!"])));
-assert_eq!(v.next(), Some(Ok([&b"hello"[..], b""])));
-assert_eq!(v.next(), None);
-
-let mut v = blobby::Blob4Iterator::new(buf).unwrap();
-assert_eq!(v.next(), Some(Ok([&b"hello"[..], b" ", b"", b"world!"])));
-assert_eq!(v.next(), Some(Ok([&b":::"[..], b"world!", b"hello", b""])));
-assert_eq!(v.next(), None);
+// We recommend saving blobby data in separate files and
+// loading it with the `include_bytes!` macro
+static BLOBBY_DATA: &[u8] = b"\x02\x05hello\x06world!\x01\x02 \x00\x03\x06:::\x03\x01\x00";
+
+static SLICE: &[&[u8]] = blobby::parse_into_slice!(BLOBBY_DATA);
+
+assert_eq!(SLICE[0], b"hello".as_slice());
+assert_eq!(SLICE[1], b" ".as_slice());
+assert_eq!(SLICE[2], b"".as_slice());
+assert_eq!(SLICE[3], b"world!".as_slice());
+assert_eq!(SLICE[4], b":::".as_slice());
+assert_eq!(SLICE[5], b"world!".as_slice());
+assert_eq!(SLICE[6], b"hello".as_slice());
+assert_eq!(SLICE[7], b"".as_slice());
+assert_eq!(SLICE.len(), 8);
+
+blobby::parse_into_structs!(
+    BLOBBY_DATA;
+    #[define_struct]
+    static ITEMS: &[Item { a, b, c, d }];
+);
+
+assert_eq!(
+    ITEMS[0],
+    Item {
+        a: b"hello",
+        b: b" ",
+        c: b"",
+        d: b"world!",
+    },
+);
+assert_eq!(
+    ITEMS[1],
+    Item {
+        a: b":::",
+        b: b"world!",
+        c: b"hello",
+        d: b"",
+    },
+);
+assert_eq!(ITEMS.len(), 2);
 ```
 
 ## Encoding and decoding
@@ -76,14 +94,14 @@ with `\n`).
 This file can be converted to the Blobby format by running the
 following command:
 
 ```sh
-cargo run --releae --bin encode -- /path/to/input.txt /path/to/output.blb
+cargo run --release --features alloc --bin encode -- /path/to/input.txt /path/to/output.blb
 ```
 
 This will create a file which can be read using `blobby::Blob2Iterator`.
 To see contents of an existing Blobby file you can use the following command:
 
 ```sh
-cargo run --releae --bin decode -- /path/to/input.blb /path/to/output.txt
+cargo run --release --features alloc --bin decode -- /path/to/input.blb /path/to/output.txt
 ```
 
 The output file will contain a sequence of hex-encoded byte strings stored
 in the input file.
diff --git a/blobby/src/bin/decode.rs b/blobby/src/bin/decode.rs
index ff4bebb8..1269bc01 100644
--- a/blobby/src/bin/decode.rs
+++ b/blobby/src/bin/decode.rs
@@ -1,41 +1,41 @@
 //! Decoding utility
-use blobby::BlobIterator;
-use std::io::{self, BufRead, BufReader, BufWriter, Write};
-use std::{env, error::Error, fs::File};
-
-fn encode_hex(data: &[u8]) -> String {
-    let mut res = String::with_capacity(2 * data.len());
-    for &byte in data {
-        res.push_str(&format!("{byte:02X}"));
-    }
-    res
+use std::error::Error;
+
+#[cfg(not(feature = "alloc"))]
+fn main() -> Result<(), Box<dyn Error>> {
+    Err("The decode binary must be compiled with the `alloc` feature enabled!".into())
 }
 
-fn decode<R: BufRead, W: Write>(mut reader: R, mut writer: W) -> io::Result<usize> {
-    let mut data = Vec::new();
-    reader.read_to_end(&mut data)?;
-    let res = BlobIterator::new(&data)
-        .map_err(|e| {
-            io::Error::new(
-                io::ErrorKind::InvalidData,
-                format!("invalid blobby data: {e:?}"),
-            )
-        })?
-        .collect::<Vec<_>>();
-    for blob in res.iter() {
-        let blob = blob.map_err(|e| {
+#[cfg(feature = "alloc")]
+fn main() -> Result<(), Box<dyn Error>> {
+    use std::io::{self, BufRead, BufReader, BufWriter, Write};
+    use std::{env, fs::File};
+
+    fn encode_hex(data: &[u8]) -> String {
+        let mut res = String::with_capacity(2 * data.len());
+        for &byte in data {
+            res.push_str(&format!("{byte:02X}"));
+        }
+        res
+    }
+
+    fn decode<R: BufRead, W: Write>(mut reader: R, mut writer: W) -> io::Result<usize> {
+        let mut data = Vec::new();
+        reader.read_to_end(&mut data)?;
+        let res = blobby::parse_into_vec(&data).map_err(|e| {
             io::Error::new(
                 io::ErrorKind::InvalidData,
                 format!("invalid blobby data: {e:?}"),
             )
         })?;
-        writer.write_all(encode_hex(blob).as_bytes())?;
-        writer.write_all(b"\n")?;
+        let len = res.len();
+        for blob in res {
+            writer.write_all(encode_hex(blob).as_bytes())?;
+            writer.write_all(b"\n")?;
+        }
+        Ok(len)
     }
-    Ok(res.len())
-}
-
-fn main() -> Result<(), Box<dyn Error>> {
     let args: Vec<String> = env::args().skip(1).collect();
     if args.is_empty() {
diff --git a/blobby/src/bin/encode.rs b/blobby/src/bin/encode.rs
index c4686c58..2e0958d9 100644
--- a/blobby/src/bin/encode.rs
+++ b/blobby/src/bin/encode.rs
@@ -1,49 +1,57 @@
 //! Encoding utility
-use blobby::encode_blobs;
-use std::io::{self, BufRead, BufReader, BufWriter, Write};
-use std::{env, error::Error, fs::File};
+use std::error::Error;
 
-fn decode_hex_char(b: u8) -> io::Result<u8> {
-    let res = match b {
-        b'0'..=b'9' => b - b'0',
-        b'a'..=b'f' => b - b'a' + 10,
-        b'A'..=b'F' => b - b'A' + 10,
-        _ => {
-            let msg = "Invalid hex string: invalid byte {b}";
-            return Err(io::Error::new(io::ErrorKind::InvalidData, msg));
-        }
-    };
-    Ok(res)
+#[cfg(not(feature = "alloc"))]
+fn main() -> Result<(), Box<dyn Error>> {
+    Err("The encode binary must be compiled with the `alloc` feature enabled!".into())
 }
 
-fn decode_hex(data: &str) -> io::Result<Vec<u8>> {
-    if data.len() % 2 != 0 {
-        let msg = "Invalid hex string: length is not even";
-        return Err(io::Error::new(io::ErrorKind::InvalidData, msg));
+#[cfg(feature = "alloc")]
+fn main() -> Result<(), Box<dyn Error>> {
+    use blobby::encode_blobs;
+    use std::io::{self, BufRead, BufReader, BufWriter, Write};
+    use std::{env, fs::File};
+
+    fn decode_hex_char(b: u8) -> io::Result<u8> {
+        let res = match b {
+            b'0'..=b'9' => b - b'0',
+            b'a'..=b'f' => b - b'a' + 10,
+            b'A'..=b'F' => b - b'A' + 10,
+            _ => {
+                let msg = "Invalid hex string: invalid byte {b}";
+                return Err(io::Error::new(io::ErrorKind::InvalidData, msg));
+            }
+        };
+        Ok(res)
     }
-    data.as_bytes()
-        .chunks_exact(2)
-        .map(|chunk| {
-            let a = decode_hex_char(chunk[0])?;
-            let b = decode_hex_char(chunk[1])?;
-            Ok((a << 4) | b)
-        })
-        .collect()
-}
-
-fn encode(reader: impl BufRead, mut writer: impl Write) -> io::Result<usize> {
-    let mut blobs = Vec::new();
-    for line in reader.lines() {
-        let blob = decode_hex(&line?)?;
-        blobs.push(blob);
+    fn decode_hex(data: &str) -> io::Result<Vec<u8>> {
+        if data.len() % 2 != 0 {
+            let msg = "Invalid hex string: length is not even";
+            return Err(io::Error::new(io::ErrorKind::InvalidData, msg));
+        }
+        data.as_bytes()
+            .chunks_exact(2)
+            .map(|chunk| {
+                let a = decode_hex_char(chunk[0])?;
+                let b = decode_hex_char(chunk[1])?;
+                Ok((a << 4) | b)
+            })
+            .collect()
+    }
+
+    fn encode(reader: impl BufRead, mut writer: impl Write) -> io::Result<usize> {
+        let mut blobs = Vec::new();
+        for line in reader.lines() {
+            let blob = decode_hex(&line?)?;
+            blobs.push(blob);
+        }
+        let (data, idx_len) = encode_blobs(&blobs);
+        println!("Index len: {idx_len:?}");
+        writer.write_all(&data)?;
+        Ok(blobs.len())
     }
-    let (data, idx_len) = encode_blobs(&blobs);
-    println!("Index len: {idx_len:?}");
-    writer.write_all(&data)?;
-    Ok(blobs.len())
-}
-
-fn main() -> Result<(), Box<dyn Error>> {
     let args: Vec<String> = env::args().skip(1).collect();
     if args.is_empty() {
diff --git a/blobby/src/decode.rs b/blobby/src/decode.rs
new file mode 100644
index 00000000..707b0fb6
--- /dev/null
+++ b/blobby/src/decode.rs
@@ -0,0 +1,285 @@
+use super::{Error, NEXT_MASK, VAL_MASK};
+
+pub(crate) const fn read_vlq(data: &mut &[u8]) -> Result<usize, Error> {
+    let b = match data.split_first() {
+        Some((&b, rest)) => {
+            *data = rest;
+            b
+        }
+        None => return Err(Error::UnexpectedEnd),
+    };
+    let mut next = b & NEXT_MASK;
+    let mut val = (b & VAL_MASK) as usize;
+
+    macro_rules! step {
+        () => {
+            if next == 0 {
+                return Ok(val);
+            }
+            let b = match data.split_first() {
+                Some((&b, rest)) => {
+                    *data = rest;
+                    b
+                }
+                None => return Err(Error::UnexpectedEnd),
+            };
+
+            next = b & NEXT_MASK;
+            let t = (b & VAL_MASK) as usize;
+            val = ((val + 1) << 7) + t;
+        };
+    }
+
+    step!();
+    step!();
+    step!();
+
+    if next != 0 {
+        return Err(Error::InvalidVlq);
+    }
+
+    Ok(val)
+}
+
+macro_rules! try_read_vlq {
+    ($data:expr) => {
+        match read_vlq(&mut $data) {
+            Ok(v) => v,
+            Err(err) => return Err(err),
+        }
+    };
+}
+
+pub const fn parse_dedup_len(mut data: &[u8]) -> Result<usize, Error> {
+    read_vlq(&mut data)
+}
+
+pub const fn parse_items_len(mut data: &[u8]) -> Result<usize, Error> {
+    let dedup_index_len = try_read_vlq!(data);
+
+    let mut i = 0;
+    while i < dedup_index_len {
+        let m = try_read_vlq!(data);
+        let split = data.split_at(m);
+        data = split.1;
+        i += 1;
+    }
+
+    let mut i = 0;
+    loop {
+        if data.is_empty() {
+            return Ok(i);
+        }
+        let val = try_read_vlq!(data);
+        // the least significant bit is used as a flag
+        let is_ref = (val & 1) != 0;
+        let val = val >> 1;
+        if is_ref {
+            if val >= dedup_index_len {
+                return Err(Error::InvalidIndex);
+            }
+        } else {
+            if val > data.len() {
+                return Err(Error::UnexpectedEnd);
+            }
+            let split = data.split_at(val);
+            data = split.1;
+        };
+        i += 1;
+    }
+}
+
+/// Parse blobby data into an array.
+pub const fn parse_into_array<const DEDUP_LEN: usize, const ITEMS: usize>(
+    mut data: &[u8],
+) -> Result<[&[u8]; ITEMS], Error> {
+    if try_read_vlq!(data) != DEDUP_LEN {
+        return Err(Error::BadArrayLen);
+    }
+
+    let mut dedup_index: [&[u8]; DEDUP_LEN] = [&[]; DEDUP_LEN];
+
+    let mut i = 0;
+    while i < dedup_index.len() {
+        let m = try_read_vlq!(data);
+        let split = data.split_at(m);
+        dedup_index[i] = split.0;
+        data = split.1;
+        i += 1;
+    }
+
+    let mut res: [&[u8]; ITEMS] = [&[]; ITEMS];
+
+    let mut i = 0;
+    while i < res.len() {
+        let val = try_read_vlq!(data);
+        // the least significant bit is used as a flag
+        let is_ref = (val & 1) != 0;
+        let val = val >> 1;
+        res[i] = if is_ref {
+            if val >= dedup_index.len() {
+                return Err(Error::InvalidIndex);
+            }
+            dedup_index[val]
+        } else {
+            if val > data.len() {
+                return Err(Error::UnexpectedEnd);
+            }
+            let split = data.split_at(val);
+            data = split.1;
+            split.0
+        };
+        i += 1;
+    }
+
+    if data.is_empty() {
+        Ok(res)
+    } else {
+        Err(Error::BadArrayLen)
+    }
+}
+
+/// Parse blobby data into a vector of slices.
+#[cfg(feature = "alloc")]
+pub fn parse_into_vec(mut data: &[u8]) -> Result<alloc::vec::Vec<&[u8]>, Error> {
+    use alloc::{vec, vec::Vec};
+
+    let dedup_len = try_read_vlq!(data);
+
+    let mut dedup_index: Vec<&[u8]> = vec![&[]; dedup_len];
+
+    let mut i = 0;
+    while i < dedup_index.len() {
+        let m = try_read_vlq!(data);
+        let split = data.split_at(m);
+        dedup_index[i] = split.0;
+        data = split.1;
+        i += 1;
+    }
+
+    let items_len = parse_items_len(data)?;
+    let mut res: Vec<&[u8]> = vec![&[]; items_len];
+
+    let mut i = 0;
+    while i < res.len() {
+        let val = try_read_vlq!(data);
+        // the least significant bit is used as a flag
+        let is_ref = (val & 1) != 0;
+        let val = val >> 1;
+        res[i] = if is_ref {
+            if val >= dedup_index.len() {
+                return Err(Error::InvalidIndex);
+            }
+            dedup_index[val]
+        } else {
+            if val > data.len() {
+                return Err(Error::UnexpectedEnd);
+            }
+            let split = data.split_at(val);
+            data = split.1;
+            split.0
+        };
+        i += 1;
+    }
+
+    assert!(data.is_empty());
+    Ok(res)
+}
+
+#[macro_export]
+macro_rules! parse_into_slice {
+    ($data:expr) => {{
+        const ITEMS_LEN: usize = {
+            match $crate::parse_items_len($data) {
+                Ok(v) => v,
+                Err(_) => panic!("Failed to parse items len"),
+            }
+        };
+        const DEDUP_LEN: usize = {
+            match $crate::parse_dedup_len($data) {
+                Ok(v) => v,
+                Err(_) => panic!("Failed to parse dedup len"),
+            }
+        };
+        const ITEMS: [&[u8]; ITEMS_LEN] = {
+            match $crate::parse_into_array::<DEDUP_LEN, ITEMS_LEN>($data) {
+                Ok(v) => v,
+                Err(_) => panic!("Failed to parse items"),
+            }
+        };
+        ITEMS.as_slice()
+    }};
+}
+
+#[macro_export]
+macro_rules! parse_into_structs {
+    (
+        $data:expr;
+        #[define_struct]
+        $static_vis:vis static $items_name:ident: &[
+            $ty_vis:vis $item:ident { $($field:ident),* $(,)? }
+        ];
+    ) => {
+        #[derive(Debug, Clone, Copy, Eq, PartialEq)]
+        $ty_vis struct $item {
+            $(pub $field: &'static [u8]),*
+        }
+
+        $crate::parse_into_structs!(
+            $data;
+            $static_vis static $items_name: &[
+                $item { $($field),* }
+            ];
+        );
+    };
+
+    (
+        $data:expr;
+        $static_vis:vis static $items_name:ident: &[
+            $item:ident { $($field:ident),* $(,)? }
+        ];
+    ) => {
+        $static_vis static $items_name: &[$item] = {
+            const RAW_ITEMS: &[&[u8]] = $crate::parse_into_slice!($data);
+
+            const fn get_struct(items: &mut &[&'static [u8]]) -> $item {
+                $item {
+                    $($field: {
+                        match items.split_first() {
+                            Some((first, rest)) => {
+                                *items = rest;
+                                first
+                            }
+                            None => unreachable!(),
+                        }
+                    }),*
+                }
+            }
+
+            const ITEM_FIELDS: usize = 0 $( + {
+                let $field: (); let _ = $field;
+                1
+            })*;
+
+            const ITEMS_LEN: usize = if RAW_ITEMS.len() % ITEM_FIELDS == 0 {
+                RAW_ITEMS.len() / ITEM_FIELDS
+            } else {
+                panic!("Number of raw items is not multiple of number of fields in the struct");
+            };
+
+            const ITEMS: [$item; ITEMS_LEN] = {
+                let mut res = [$item { $($field: &[]),* }; ITEMS_LEN];
+
+                let mut raw_items = RAW_ITEMS;
+                let mut i = 0;
+                while i < res.len() {
+                    res[i] = get_struct(&mut raw_items);
+                    i += 1;
+                }
+                res
+            };
+
+            ITEMS.as_slice()
+        };
+    };
+}
diff --git a/blobby/src/encode.rs b/blobby/src/encode.rs
new file mode 100644
index 00000000..786dbf14
--- /dev/null
+++ b/blobby/src/encode.rs
@@ -0,0 +1,183 @@
+use super::{NEXT_MASK, VAL_MASK};
+
+/// Write a git-flavoured VLQ value into `buf`.
+///
+/// Returns the slice within `buf` that holds the value.
+fn encode_vlq(mut val: usize, buf: &mut [u8; 4]) -> &[u8] {
+    macro_rules! step {
+        ($n:expr) => {
+            buf[$n] = if $n == 3 {
+                (val & (VAL_MASK as usize)) as u8
+            } else {
+                val -= 1;
+                NEXT_MASK | (val & (VAL_MASK as usize)) as u8
+            };
+            val >>= 7;
+            if val == 0 {
+                return &buf[$n..];
+            }
+        };
+    }
+
+    step!(3);
+    step!(2);
+    step!(1);
+    step!(0);
+    panic!("integer is too big")
+}
+
+/// Encode the given collection of binary blobs in .blb format.
+/// Returns the encoded data together with a count of the number of blobs included in the index.
+///
+/// The encoded file format is:
+/// - count of index entries=N
+/// - N x index entries, each encoded as:
+///     - size L of index entry (VLQ)
+///     - index blob contents (L bytes)
+/// - repeating encoded blobs, each encoded as:
+///     - VLQ value that is either:
+///         - (J << 1) | 0x01: indicates this blob is index entry J
+///         - (L << 1) | 0x00: indicates an explicit blob of len L
+///     - (in the latter case) explicit blob contents (L bytes)
+pub fn encode_blobs<'a, I, T>(blobs: &'a I) -> (alloc::vec::Vec<u8>, usize)
+where
+    &'a I: IntoIterator<Item = &'a T>,
+    T: AsRef<[u8]> + 'a,
+{
+    use alloc::{collections::BTreeMap, vec::Vec};
+
+    let mut idx_map = BTreeMap::new();
+    blobs
+        .into_iter()
+        .map(|v| v.as_ref())
+        .filter(|blob| !blob.is_empty())
+        .for_each(|blob| {
+            let v = idx_map.entry(blob.as_ref()).or_insert(0);
+            *v += 1;
+        });
+
+    let mut idx: Vec<&[u8]> = idx_map
+        .iter()
+        .filter(|&(_, &v)| v > 1)
+        .map(|(&k, _)| k)
+        .collect();
+    idx.sort_by_key(|e| {
+        let k = match e {
+            [0] => 2,
+            [1] => 1,
+            _ => 0,
+        };
+        (k, idx_map.get(e).unwrap())
+    });
+    idx.reverse();
+    let idx_len = idx.len();
+
+    let rev_idx: BTreeMap<&[u8], usize> = idx.iter().enumerate().map(|(i, &e)| (e, i)).collect();
+
+    let mut out_buf = Vec::new();
+    let mut buf = [0u8; 4];
+    out_buf.extend_from_slice(encode_vlq(idx.len(), &mut buf));
+    for e in idx {
+        out_buf.extend_from_slice(encode_vlq(e.len(), &mut buf));
+        out_buf.extend_from_slice(e);
+    }
+
+    for blob in blobs.into_iter().map(|v| v.as_ref()) {
+        if let Some(dup_pos) = rev_idx.get(blob) {
+            let n = (dup_pos << 1) + 1usize;
+            out_buf.extend_from_slice(encode_vlq(n, &mut buf));
+        } else {
+            let n = blob.len() << 1;
+            out_buf.extend_from_slice(encode_vlq(n, &mut buf));
+            out_buf.extend_from_slice(blob);
+        }
+    }
+
+    (out_buf, idx_len)
+}
+
+#[cfg(test)]
+mod tests {
+    use crate::{Error, NEXT_MASK, VAL_MASK, decode::read_vlq};
+
+    fn encode_vlq(mut val: usize, buf: &mut [u8; 4]) -> &[u8] {
+        macro_rules! step {
+            ($n:expr) => {
+                buf[$n] = if $n == 3 {
+                    (val & (VAL_MASK as usize)) as u8
+                } else {
+                    val -= 1;
+                    NEXT_MASK | (val & (VAL_MASK as usize)) as u8
+                };
+                val >>= 7;
+                if val == 0 {
+                    return &buf[$n..];
+                }
+            };
+        }
+
+        step!(3);
+        step!(2);
+        step!(1);
+        step!(0);
+        panic!("integer is too big")
+    }
+
+    #[test]
+    fn encode_decode() {
+        let mut buf = [0u8; 4];
+        for val in 0..=270549119 {
+            let mut res = encode_vlq(val, &mut buf);
+            let val_res = read_vlq(&mut res).unwrap();
+            assert_eq!(val, val_res);
+            assert!(res.is_empty());
+        }
+    }
+
+    #[test]
+    #[rustfmt::skip]
+    fn test_vlq() {
+        let mut example_buf: &[u8] = &[
+            0b0000_0000, // 0
+            0b0000_0010, // 2
+            0b0111_1111, // 127
+            0b1000_0000, 0b0000_0000, // 128
+            0b1111_1111, 0b0111_1111, // 16511
+            0b1000_0000, 0b1000_0000, 0b0000_0000, // 16512
+            0b1111_1111, 0b1111_1111, 0b0111_1111, // 2113663
+            0b1000_0000, 0b1000_0000, 0b1000_0000, 0b0000_0000, // 2113664
+            0b1111_1111, 0b1111_1111, 0b1111_1111, 0b0111_1111, // 270549119
+            0b1111_1111, 0b1111_1111, 0b1111_1111, 0b1111_1111, 0b0111_1111,
+        ];
+
+        let targets = [
+            (0, 1),
+            (2, 1),
+            (127, 1),
+            (128, 2),
+            (16511, 2),
+            (16512, 3),
+            (2113663, 3),
+            (2113664, 4),
+            (270549119, 4),
+        ];
+
+        let mut buf = [0u8; 4];
+
+        let mut rem_len = example_buf.len();
+
+        for (target_val, target_size) in targets {
+            assert_eq!(encode_vlq(target_val, &mut buf), &example_buf[..target_size]);
+
+            let val = read_vlq(&mut example_buf).unwrap();
+            assert_eq!(val, target_val);
+
+            rem_len -= target_size;
+            assert_eq!(example_buf.len(), rem_len);
+        }
+
+        // Only VLQ values of up to 4 bytes are supported
+        assert_eq!(read_vlq(&mut example_buf), Err(Error::InvalidVlq));
+    }
+}
diff --git a/blobby/src/lib.rs b/blobby/src/lib.rs
index d95dc42b..b5516646 100644
--- a/blobby/src/lib.rs
+++ b/blobby/src/lib.rs
@@ -4,18 +4,23 @@
     html_logo_url = "https://raw.githubusercontent.com/RustCrypto/media/6ee8e381/logo.svg",
     html_favicon_url = "https://raw.githubusercontent.com/RustCrypto/media/6ee8e381/logo.svg"
 )]
+#![cfg_attr(docsrs, feature(doc_auto_cfg))]
+#![deny(unsafe_code)]
+
+#[cfg(feature = "alloc")]
 extern crate alloc;
 
-use alloc::{boxed::Box, collections::BTreeMap, vec, vec::Vec};
+pub(crate) mod decode;
+#[cfg(feature = "alloc")]
+pub use decode::parse_into_vec;
+pub use decode::{parse_dedup_len, parse_into_array, parse_items_len};
 
-/// Iterator over binary blobs
-pub struct BlobIterator<'a> {
-    data: &'a [u8],
-    dedup: Box<[&'a [u8]]>,
-    pos: usize,
-}
+#[cfg(feature = "alloc")]
+mod encode;
+#[cfg(feature = "alloc")]
+pub use encode::encode_blobs;
 
-/// `blobby` error type
+/// Error type used by `blobby` functions
 #[derive(Debug, Eq, PartialEq, Copy, Clone)]
 pub enum Error {
     /// Decoded VLQ number is too big
@@ -26,319 +31,9 @@ pub enum Error {
     UnexpectedEnd,
     /// Not enough elements for `BlobNIterator`
     NotEnoughElements,
+    /// Bad array length was provided to [`parse_into_array`]
+    BadArrayLen,
 }
 
 const NEXT_MASK: u8 = 0b1000_0000;
 const VAL_MASK: u8 = 0b0111_1111;
-
-/// Read a git-flavoured VLQ value from `&data[*pos..]`.
-/// Increments `pos` to a number of read bytes.
-///
-/// This function returns `None` if buffer does not contain enough bytes
-/// or if VLQ is bigger than 4 bytes.
-///
-/// See the test submodule for example values.
-fn read_vlq(data: &[u8], pos: &mut usize) -> Result<usize, Error> {
-    let b = data.get(*pos).ok_or(Error::UnexpectedEnd)?;
-    *pos += 1;
-    let mut next = b & NEXT_MASK;
-    let mut val = (b & VAL_MASK) as usize;
-
-    macro_rules! step {
-        () => {
-            if next == 0 {
-                return Ok(val);
-            }
-            let b = data.get(*pos).ok_or(Error::UnexpectedEnd)?;
-            *pos += 1;
-            next = b & NEXT_MASK;
-            let t = (b & VAL_MASK) as usize;
-            val = ((val + 1) << 7) + t;
-        };
-    }
-
-    step!();
-    step!();
-    step!();
-
-    if next != 0 {
-        return Err(Error::InvalidVlq);
-    }
-
-    Ok(val)
-}
-
-/// Write a git-flavoured VLQ value into `buf`.
-///
-/// Returns the slice within `buf` that holds the value.
-fn encode_vlq(mut val: usize, buf: &mut [u8; 4]) -> &[u8] {
-    macro_rules! step {
-        ($n:expr) => {
-            buf[$n] = if $n == 3 {
-                (val & (VAL_MASK as usize)) as u8
-            } else {
-                val -= 1;
-                NEXT_MASK | (val & (VAL_MASK as usize)) as u8
-            };
-            val >>= 7;
-            if val == 0 {
-                return &buf[$n..];
-            }
-        };
-    }
-
-    step!(3);
-    step!(2);
-    step!(1);
-    step!(0);
-    panic!("integer is too big")
-}
-
-/// Encode the given collection of binary blobs in .blb format into `writer`.
-/// Returns the encoded data together with a count of the number of blobs included in the index.
-///
-/// The encoded file format is:
-/// - count of index entries=N
-/// - N x index entries, each encoded as:
-///     - size L of index entry (VLQ)
-///     - index blob contents (L bytes)
-/// - repeating encoded blobs, each encoded as:
-///     - VLQ value that is either:
-///         - (J << 1) & 0x01: indicates this blob is index entry J
-///         - (L << 1) & 0x00: indicates an explicit blob of len L
-///     - (in the latter case) explicit blob contents (L bytes)
-pub fn encode_blobs<'a, I, T>(blobs: &'a I) -> (Vec<u8>, usize)
-where
-    &'a I: IntoIterator<Item = &'a T>,
-    T: AsRef<[u8]> + 'a,
-{
-    let mut idx_map = BTreeMap::new();
-    blobs
-        .into_iter()
-        .map(|v| v.as_ref())
-        .filter(|blob| !blob.is_empty())
-        .for_each(|blob| {
-            let v = idx_map.entry(blob.as_ref()).or_insert(0);
-            *v += 1;
-        });
-
-    let mut idx: Vec<&[u8]> = idx_map
-        .iter()
-        .filter(|&(_, &v)| v > 1)
-        .map(|(&k, _)| k)
-        .collect();
-    idx.sort_by_key(|e| {
-        let k = match e {
-            [0] => 2,
-            [1] => 1,
-            _ => 0,
-        };
-        (k, idx_map.get(e).unwrap())
-    });
-    idx.reverse();
-    let idx_len = idx.len();
-
-    let rev_idx: BTreeMap<&[u8], usize> = idx.iter().enumerate().map(|(i, &e)| (e, i)).collect();
-
-    let mut out_buf = Vec::new();
-    let mut buf = [0u8; 4];
-    out_buf.extend_from_slice(encode_vlq(idx.len(), &mut buf));
-    for e in idx {
-        out_buf.extend_from_slice(encode_vlq(e.len(), &mut buf));
-        out_buf.extend_from_slice(e);
-    }
-
-    for blob in blobs.into_iter().map(|v| v.as_ref()) {
-        if let Some(dup_pos) = rev_idx.get(blob) {
-            let n = (dup_pos << 1) + 1usize;
-            out_buf.extend_from_slice(encode_vlq(n, &mut buf));
-        } else {
-            let n = blob.len() << 1;
-            out_buf.extend_from_slice(encode_vlq(n, &mut buf));
-            out_buf.extend_from_slice(blob);
-        }
-    }
-
-    (out_buf, idx_len)
-}
-
-impl<'a> BlobIterator<'a> {
-    /// Create new `BlobIterator` for given `data`.
-    pub fn new(data: &'a [u8]) -> Result<Self, Error> {
-        let mut pos = 0;
-        let dedup_n = read_vlq(data, &mut pos)?;
-
-        let mut dedup: Vec<&[u8]> = vec![&[]; dedup_n];
-        for entry in dedup.iter_mut() {
-            let m = read_vlq(data, &mut pos).unwrap();
-            *entry = &data[pos..pos + m];
-            pos += m;
-        }
-        Ok(BlobIterator {
-            data: &data[pos..],
-            dedup: dedup.into_boxed_slice(),
-            pos: 0,
-        })
-    }
-
-    fn read(&mut self) -> Result<&'a [u8], Error> {
-        let val = read_vlq(self.data, &mut self.pos).unwrap();
-        // the least significant bit is used as a flag
-        let is_ref = (val & 1) != 0;
-        let val = val >> 1;
-        if is_ref {
-            if val >= self.dedup.len() {
-                return Err(Error::InvalidIndex);
-            }
-            Ok(self.dedup[val])
-        } else {
-            let s = self.pos;
-            self.pos += val;
-            Ok(self.data.get(s..self.pos).ok_or(Error::UnexpectedEnd)?)
-        }
-    }
-
-    fn error_block(&mut self) {
-        self.pos = self.data.len();
-    }
-}
-
-impl<'a> Iterator for BlobIterator<'a> {
-    type Item = Result<&'a [u8], Error>;
-
-    fn next(&mut self) -> Option<Self::Item> {
-        if self.pos < self.data.len() {
-            let val = self.read();
-            if val.is_err() {
-                self.error_block()
-            }
-            Some(val)
-        } else {
-            None
-        }
-    }
-}
-
-// TODO: use const generics on stabilization
-// docs are not generated due to https://github.com/rust-lang/rust/issues/52607
-macro_rules! new_iter {
-    ($name:ident, $n:expr) => {
-        pub struct $name<'a> {
-            inner: BlobIterator<'a>,
-        }
-
-        impl<'a> $name<'a> {
-            pub fn new(data: &'a [u8]) -> Result<Self, Error> {
-                BlobIterator::new(data).map(|inner| Self { inner })
-            }
-        }
-
-        impl<'a> Iterator for $name<'a> {
-            type Item = Result<[&'a [u8]; $n], Error>;
-
-            fn next(&mut self) -> Option<Self::Item> {
-                let mut res: [&[u8]; $n] = Default::default();
-
-                for (i, v) in res.iter_mut().enumerate() {
-                    *v = match self.inner.next() {
-                        Some(Ok(val)) => val,
-                        Some(Err(e)) => return Some(Err(e)),
-                        None if i == 0 => return None,
-                        None => {
-                            self.inner.error_block();
-                            return Some(Err(Error::NotEnoughElements));
-                        }
-                    };
-                }
-                Some(Ok(res))
-            }
-        }
-    };
-}
-
-new_iter!(Blob2Iterator, 2);
-new_iter!(Blob3Iterator, 3);
-new_iter!(Blob4Iterator, 4);
-new_iter!(Blob5Iterator, 5);
-new_iter!(Blob6Iterator, 6);
-
-#[cfg(test)]
-mod tests {
-    use super::{Error, NEXT_MASK, VAL_MASK, read_vlq};
-
-    fn encode_vlq(mut val: usize, buf: &mut [u8; 4]) -> &[u8] {
-        macro_rules! step {
-            ($n:expr) => {
-                buf[$n] = if $n == 3 {
-                    (val & (VAL_MASK as usize)) as u8
-                } else {
-                    val -= 1;
-                    NEXT_MASK | (val & (VAL_MASK as usize)) as u8
-                };
-                val >>= 7;
-                if val == 0 {
-                    return &buf[$n..];
-                }
-            };
-        }
-
-        step!(3);
-        step!(2);
-        step!(1);
-        step!(0);
-        panic!("integer is too big")
-    }
-
-    #[test]
-    fn encode_decode() {
-        let mut buf = [0u8; 4];
-        for val in 0..=270549119 {
-            let res = encode_vlq(val, &mut buf);
-            let val_res = read_vlq(res, &mut 0).unwrap();
-            assert_eq!(val, val_res);
-        }
-    }
-
-    #[test]
-    #[rustfmt::skip]
-    fn test_vlq() {
-        let mut pos = 0;
-        let examples = [
-            0b0000_0000, // 0
-            0b0000_0010, // 2
-            0b0111_1111, // 127
-            0b1000_0000, 0b0000_0000, // 128
-            0b1111_1111, 0b0111_1111, // 16511
-            0b1000_0000, 0b1000_0000, 0b0000_0000, // 16512
-            0b1111_1111, 0b1111_1111, 0b0111_1111, // 2113663
-            0b1000_0000, 0b1000_0000, 0b1000_0000, 0b0000_0000, // 2113664
-            0b1111_1111, 0b1111_1111, 0b1111_1111, 0b0111_1111, // 270549119
-            0b1111_1111, 0b1111_1111, 0b1111_1111, 0b1111_1111, 0b0111_1111,
-        ];
-
-        let targets = [
-            (0, 1),
-            (2, 1),
-            (127, 1),
-            (128, 2),
-            (16511, 2),
-            (16512, 3),
-            (2113663, 3),
-            (2113664, 4),
-            (270549119, 4),
-        ];
-
-        let mut buf = [0u8; 4];
-
-        for &(val, size) in targets.iter() {
-            let prev_pos = pos;
-            assert_eq!(read_vlq(&examples, &mut pos), Ok(val));
-            assert_eq!(pos - prev_pos, size);
-            assert_eq!(encode_vlq(val, &mut buf), &examples[prev_pos..pos]);
-        }
-
-        // only VLQ values of up to 4 bytes are supported
-        assert_eq!(read_vlq(&examples, &mut pos), Err(Error::InvalidVlq));
-        assert_eq!(pos, 25);
-    }
-}
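For downstream crates, the workflow this diff enables is: encode test vectors with the `encode` binary, commit the resulting `.blb` file, and parse it at compile time with the new macros instead of the removed `BlobNIterator` types. The sketch below illustrates that usage; the `data/aes128.blb` path and the `TestVector` field names are hypothetical assumptions for illustration, not part of this patch.

```rust
// Hypothetical downstream usage of the macros added in this diff.
// `data/aes128.blb` and the `TestVector` field names are illustrative
// assumptions; any file produced by the `encode` binary works here.
static RAW: &[u8] = include_bytes!("data/aes128.blb");

blobby::parse_into_structs!(
    RAW;
    #[define_struct]
    static TEST_VECTORS: &[TestVector { key, plaintext, ciphertext }];
);

#[test]
fn vectors_parse() {
    // Each field is a `&'static [u8]` resolved by const evaluation,
    // so the consuming test needs no allocation and no runtime parsing.
    for tv in TEST_VECTORS {
        let _ = (tv.key, tv.plaintext, tv.ciphertext);
    }
    assert!(!TEST_VECTORS.is_empty());
}
```

Because the data is parsed into `&'static [u8]` slices at compile time, the consuming crate does not need the `alloc` feature; that feature is only required for the `encode`/`decode` binaries and `parse_into_vec`/`encode_blobs`.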