From b5a2f5cf9a627c9e7977a7ceccec40516387c77f Mon Sep 17 00:00:00 2001 From: Jason Orendorff Date: Fri, 1 Nov 2024 16:15:41 -0500 Subject: [PATCH 01/11] Add `utf8-converter`. --- README.md | 1 + crates/utf8-converter/Cargo.toml | 10 + crates/utf8-converter/README.md | 12 + crates/utf8-converter/src/bitrank.rs | 504 +++++++++++++++++++++++ crates/utf8-converter/src/lib.rs | 577 +++++++++++++++++++++++++++ 5 files changed, 1104 insertions(+) create mode 100644 crates/utf8-converter/Cargo.toml create mode 100644 crates/utf8-converter/README.md create mode 100644 crates/utf8-converter/src/bitrank.rs create mode 100644 crates/utf8-converter/src/lib.rs diff --git a/README.md b/README.md index 6232e9d..8d42c06 100644 --- a/README.md +++ b/README.md @@ -4,6 +4,7 @@ A collection of useful algorithms written in Rust. Currently contains: - [`geo_filters`](crates/geo_filters): probabilistic data structures that solve the [Distinct Count Problem](https://en.wikipedia.org/wiki/Count-distinct_problem) using geometric filters. - [`bpe`](crates/bpe): fast, correct, and novel algorithms for the [Byte Pair Encoding Algorithm](https://en.wikipedia.org/wiki/Large_language_model#BPE) which are particularly useful for chunking of documents. +- [`utf8-converter`](crates/utf8-converter): converts string positions between bytes, chars, UTF-16 code units, and line numbers. Useful when sending string indices across language boundaries. ## Background diff --git a/crates/utf8-converter/Cargo.toml b/crates/utf8-converter/Cargo.toml new file mode 100644 index 0000000..49da760 --- /dev/null +++ b/crates/utf8-converter/Cargo.toml @@ -0,0 +1,10 @@ +[package] +authors = ["The blackbird team "] +edition = "2021" +name = "utf8-converter" +version = "0.1.0" + +[dependencies] +itertools = "0.13" +rand = "0.8" +rand_chacha = "0.3" diff --git a/crates/utf8-converter/README.md b/crates/utf8-converter/README.md new file mode 100644 index 0000000..668d35d --- /dev/null +++ b/crates/utf8-converter/README.md @@ -0,0 +1,12 @@ +# UTF-8 Converter + +This crate converts string positions between Rust style (UTF-8 byte offsets) and styles used by other programming languages, as well as line numbers. + +## Usage + +Add this to your `Cargo.toml`: + +```toml +[dependencies] +utf8-converter = "0.1" +``` diff --git a/crates/utf8-converter/src/bitrank.rs b/crates/utf8-converter/src/bitrank.rs new file mode 100644 index 0000000..fc2fa87 --- /dev/null +++ b/crates/utf8-converter/src/bitrank.rs @@ -0,0 +1,504 @@ +//! A bit-vector data structure, optimized for +//! [rank](http://bitmagic.io/rank-select.html) operations. +//! +//! There is also an opportunistic `select` operation, but the general case has not been +//! implemented. + +type Chunk = u128; + +// Static sizing of the various components of the data structure. +const BITS_PER_BLOCK: usize = 16384; +const BITS_PER_SUB_BLOCK: usize = 128; +const SUB_BLOCKS_PER_BLOCK: usize = BITS_PER_BLOCK / BITS_PER_SUB_BLOCK; +const BITS_PER_CHUNK: usize = 128; +const CHUNKS_PER_SUB_BLOCK: usize = BITS_PER_SUB_BLOCK / BITS_PER_CHUNK; + +/// A container for a portion of the total bit vector and the associated indices. +/// The bits within each chunk are stored from most significant bit (msb) to least significant bit (lsb). +/// i.e. index 0 of a Chunk is at the start of visual binary representation or a value of +/// 1u128 << 127. 
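For illustration, here is a minimal standalone sketch (not part of this diff; values are ours) of the msb-first convention just described: chunk-local index `i` selects the mask `1u128 << (127 - i)`.

```rust
fn main() {
    // msb-first: chunk-local index `i` maps to the bit `1 << (127 - i)`.
    let mask = |i: u32| -> u128 { 1u128 << (127 - i) };
    assert_eq!(mask(0), 1u128 << 127); // index 0 is the most significant bit
    assert_eq!(mask(127), 1);          // index 127 is the least significant bit

    // Setting indices 0 and 2 puts both bits at the top end of the chunk.
    let chunk: u128 = mask(0) | mask(2);
    assert_eq!(chunk.leading_zeros(), 0);
    assert_eq!(chunk.count_ones(), 2);
}
```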
+/// +/// The actual bits are stored alongside the indices because the common case will be reading this +/// information from disk (rather than random access memory), so it is beneficial to have all of +/// the data that we need in the same page. +/// +/// ```text +/// index: [ 0, 1, 2, 3, 4, 5, 6, 7 ] +/// bits: [ 0, 1, 0, 1, 1, 0, 1, 0 ] +/// rank(exclusive): [ 0, 0, 1, 1, 2, 3, 3, 4 ] +/// block rank: [ 0 ] +/// sub-block rank: [ 0 ][ 2 ] +/// ``` +#[derive(Clone, Debug)] +#[repr(C)] +struct Block { + /// Rank of the first bit in this block (that is, the number of bits set in previous blocks). + rank: u64, + /// Rank of the first bit (bit 0) of each subblock, relative to the start of the block. + /// That is, `sub_blocks[i]` is the number of bits set in the `bits` representing + /// sub-blocks `0..i`. `sub_blocks[0]` is always zero. + sub_blocks: [u16; SUB_BLOCKS_PER_BLOCK], + /// The bit-vector. + bits: [Chunk; BITS_PER_BLOCK / BITS_PER_CHUNK], +} + +impl Block { + fn new(rank: u64) -> Self { + Self { + rank, + sub_blocks: [0; SUB_BLOCKS_PER_BLOCK], + bits: [0; BITS_PER_BLOCK / BITS_PER_CHUNK], + } + } + + /// Set a bit without updating `self.sub_blocks`. + /// + /// This panics if the bit was already set, because that indicates that the original positions + /// list is invalid/had duplicates. + fn set(&mut self, index: usize) { + assert!(index < BITS_PER_BLOCK); + let chunk_idx = index / BITS_PER_CHUNK; + let bit_idx = index % BITS_PER_CHUNK; + let mask = 1 << ((BITS_PER_CHUNK - 1) - bit_idx); + assert_eq!(self.bits[chunk_idx] & mask, 0, "toggling bits off indicates that the original data was incorrect, most likely containing duplicate values."); + self.bits[chunk_idx] ^= mask; + } + + /// Tests whether the bit at the given index is set. + #[allow(dead_code)] + fn get(&self, index: usize) -> bool { + assert!(index < BITS_PER_BLOCK); + let chunk_idx = index / BITS_PER_CHUNK; + let bit_idx = index % BITS_PER_CHUNK; + let mask = 1 << ((BITS_PER_CHUNK - 1) - bit_idx); + self.bits[chunk_idx] & mask != 0 + } + + /// The **total rank** of the block relative local index, and the index of the one + /// bit that establishes that rank (aka "select") **if** it occurs within that same + /// chunk, otherwise ['None']. The assumption is that if you would have to look back + /// through previous chunks it would actually be cheaper to do a lookup in the original + /// data structure that the bit vector was created from. + fn rank_select(&self, local_idx: usize) -> (usize, Option) { + let mut rank = self.rank as usize; + let sub_block = local_idx / BITS_PER_SUB_BLOCK; + rank += self.sub_blocks[sub_block] as usize; + + if BITS_PER_CHUNK != BITS_PER_SUB_BLOCK { + for i in sub_block * CHUNKS_PER_SUB_BLOCK..local_idx / BITS_PER_CHUNK { + rank += self.bits[i].count_ones() as usize; + } + } + + let remainder = local_idx % BITS_PER_CHUNK; + + let last_chunk = local_idx / BITS_PER_CHUNK; + let masked = if remainder == 0 { + 0 + } else { + self.bits[last_chunk] >> (BITS_PER_CHUNK - remainder) + }; + rank += masked.count_ones() as usize; + let select = if masked == 0 { + None + } else { + Some(local_idx - masked.trailing_zeros() as usize - 1) + }; + (rank, select) + } + + fn total_rank(&self) -> usize { + self.sub_blocks[SUB_BLOCKS_PER_BLOCK - 1] as usize + + self.rank as usize + + self.bits[(SUB_BLOCKS_PER_BLOCK - 1) * CHUNKS_PER_SUB_BLOCK..] 
+ .iter() + .map(|c| c.count_ones() as usize) + .sum::() + } + + fn predecessor(&self, idx: usize) -> Option { + let sub_block = idx / BITS_PER_SUB_BLOCK; + let masked = self.bits[sub_block] >> (BITS_PER_SUB_BLOCK - 1 - idx % BITS_PER_SUB_BLOCK); + if masked > 0 { + Some(idx - masked.trailing_zeros() as usize) + } else { + for i in (0..sub_block).rev() { + let masked = self.bits[i]; + if masked > 0 { + return Some( + (i + 1) * BITS_PER_SUB_BLOCK - masked.trailing_zeros() as usize - 1, + ); + } + } + None + } + } + + fn successor(&self, idx: usize) -> Option { + let sub_block = idx / BITS_PER_SUB_BLOCK; + let masked = self.bits[sub_block] << (idx % BITS_PER_SUB_BLOCK); + if masked > 0 { + Some(idx + masked.leading_zeros() as usize) + } else { + for i in (sub_block + 1)..SUB_BLOCKS_PER_BLOCK { + let masked = self.bits[i]; + if masked > 0 { + return Some(i * BITS_PER_SUB_BLOCK + masked.leading_zeros() as usize); + } + } + None + } + } +} + +impl Default for Block { + fn default() -> Self { + Block { + rank: 0, + sub_blocks: [0u16; SUB_BLOCKS_PER_BLOCK], + bits: [0; BITS_PER_BLOCK / BITS_PER_CHUNK], + } + } +} + +/// Builder for creating a [`BitRank`]. +/// +/// # Examples +/// +/// ```text +/// // Note: This should work as a doctest, except this module is not public. +/// let mut bytes = Vec::::new(); +/// +/// let mut builder = BitRankBuilder::new(); +/// builder.push(17); +/// builder.push(23); +/// builder.push(102); +/// let set = builder.finish(); +/// assert_eq!(set.rank(100), 2); +/// ``` +#[derive(Default)] +pub struct BitRankBuilder { + blocks: Vec, + curr_rank: u64, + curr_block_id: usize, + curr_block: Option, +} + +impl BitRankBuilder { + /// Returns a new builder. + pub fn new() -> Self { + Self::default() + } + + fn push_block(&mut self, mut block: Block) -> u64 { + let mut local_rank = 0; + for (i, chunk) in block.bits.iter().enumerate() { + // If the settings are ever changed, CHUNKS_PER_SUB_BLOCK will likely no longer be 1, so + // you will need this modulo. + #[expect(clippy::modulo_one)] + if i % CHUNKS_PER_SUB_BLOCK == 0 { + block.sub_blocks[i / CHUNKS_PER_SUB_BLOCK] = local_rank; + } + local_rank += chunk.count_ones() as u16; + } + let end_rank = block.rank + local_rank as u64; + self.blocks.push(block); + end_rank + } + + /// Adds a bit. Bits must be added in order of increasing `position`. + pub fn push(&mut self, position: usize) { + let block_id = position / BITS_PER_BLOCK; + assert!( + self.curr_block_id <= block_id, + "positions must be increasing!" + ); + while block_id > self.curr_block_id { + let curr_block = self + .curr_block + .take() + .unwrap_or_else(|| Block::new(self.curr_rank)); + let end_rank = self.push_block(curr_block); + self.curr_rank = end_rank; + self.curr_block_id += 1; + } + match &mut self.curr_block { + None => { + let mut block = Block::new(self.curr_rank); + block.set(position % BITS_PER_BLOCK); + self.curr_block = Some(block); + } + Some(block) => { + block.set(position % BITS_PER_BLOCK); + } + } + } + + /// Finishes the `BitRank` by writing the last block of data. + pub fn finish(mut self) -> BitRank { + if let Some(last_block) = self.curr_block.take() { + self.push_block(last_block); + } + BitRank { + blocks: self.blocks, + } + } +} + +/// An immutable set of unsigned integers with an efficient `rank` method. +#[derive(Clone)] +pub struct BitRank { + blocks: Vec, +} + +impl BitRank { + /// Creates a `BitRank` containing the integers in `iter`. 
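Because `bitrank` is a private module, a usage example has to live in the crate's own tests; assuming the API shown in this diff, it would look roughly like the following sketch (positions are illustrative only).

```rust
// Hypothetical placement: inside the tests of crates/utf8-converter/src/bitrank.rs.
let br = BitRank::from_iter([2usize, 3, 5, 7, 11]);
assert_eq!(br.rank(0), 0);         // no set positions below 0
assert_eq!(br.rank(4), 2);         // {2, 3} lie below 4
assert_eq!(br.rank(1_000_000), 5); // out-of-range indices saturate to the total count
assert_eq!(br.max_rank(), 5);
assert_eq!(br.predecessor(6), 5);  // nearest set position at or before 6
assert_eq!(br.successor(6), 7);    // nearest set position at or after 6
```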
+ /// + /// # Panics + /// This may panic if the values produced by `iter` are not strictly increasing. + #[allow(clippy::should_implement_trait)] + #[allow(dead_code)] + pub fn from_iter>(iter: I) -> BitRank { + let mut builder = BitRankBuilder::new(); + for position in iter { + builder.push(position); + } + builder.finish() + } + + /// The rank at the specified index (exclusive). + /// + /// The (one) rank is defined as: `rank(i) = sum(b[j] for j in 0..i)` + /// i.e. the number of elements less than `i`. + pub fn rank(&self, idx: usize) -> usize { + self.rank_select(idx).0 + } + + /// Tests whether the bit at the given index is set. + #[allow(dead_code)] + pub fn get(&self, idx: usize) -> bool { + let block_num = idx / BITS_PER_BLOCK; + // assert!(block_num < self.blocks.len(), "index out of bounds"); + if block_num >= self.blocks.len() { + false + } else { + self.blocks[block_num].get(idx % BITS_PER_BLOCK) + } + } + + /// Returns the 1 bit at or before the specified index. + #[allow(dead_code)] + pub fn predecessor(&self, idx: usize) -> usize { + let block_num = idx / BITS_PER_BLOCK; + if block_num < self.blocks.len() { + if let Some(p) = self.blocks[block_num].predecessor(idx % BITS_PER_BLOCK) { + return block_num * BITS_PER_BLOCK + p; + } + } + for block_num in (0..self.blocks.len().min(block_num)).rev() { + if let Some(p) = self.blocks[block_num].predecessor(BITS_PER_BLOCK - 1) { + return block_num * BITS_PER_BLOCK + p; + } + } + panic!("no predecessor found!"); + } + + /// Returns the next 1 bit at or after the specified index. + #[allow(dead_code)] + pub fn successor(&self, idx: usize) -> usize { + let block_num = idx / BITS_PER_BLOCK; + if let Some(s) = self.blocks[block_num].successor(idx % BITS_PER_BLOCK) { + s + block_num * BITS_PER_BLOCK + } else { + for block_num in block_num + 1..self.blocks.len() { + if let Some(p) = self.blocks[block_num].successor(0) { + return block_num * BITS_PER_BLOCK + p; + } + } + panic!("no successor found!"); + } + } + + /// Returns the number of elements in the set. + pub fn max_rank(&self) -> usize { + self.blocks + .last() + .map(|b| b.total_rank()) + .unwrap_or_default() // fall back to 0 when the bitrank data structure is empty. + } + + /// The rank at the specified index(exclusive) and the index of the one bit that + /// establishes that rank (aka "select") **if** it occurs within that same chunk, + /// otherwise ['None']. The assumption is that if you would have to look back + /// through previous chunks it would actually be cheaper to do a lookup in the original + /// data structure that the bit vector was created from. + pub fn rank_select(&self, idx: usize) -> (usize, Option) { + let block_num = idx / BITS_PER_BLOCK; + // assert!(block_num < self.blocks.len(), "index out of bounds"); + if block_num >= self.blocks.len() { + ( + self.max_rank(), // fall back to 0 when the bitrank data structure is empty. + None, + ) + } else { + let (rank, b_idx) = self.blocks[block_num].rank_select(idx % BITS_PER_BLOCK); + (rank, b_idx.map(|i| (block_num * BITS_PER_BLOCK) + i)) + } + } + + /// The total size of the bit vec that was allocated. + /// **Note:** This is more like capacity than normal `len` in that it does not + /// consider how much of the bit vec is actually used. 
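A quick sketch of the distinction drawn above, using the internal API from this diff (`16384` is `BITS_PER_BLOCK` in this patch; values are illustrative):

```rust
// One position in block 0 and one in block 1, so two blocks get allocated.
let br = BitRank::from_iter([5usize, 20_000]);
assert_eq!(br.capacity(), 2 * 16384); // allocated bits across both blocks
assert_eq!(br.max_rank(), 2);         // number of positions actually set
```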
+ #[allow(dead_code)] + pub fn capacity(&self) -> usize { + self.blocks.len() * BITS_PER_BLOCK + } +} + +#[cfg(test)] +mod tests { + use itertools::Itertools; + use rand::distributions::Uniform; + use rand::prelude::*; + use rand_chacha::ChaCha8Rng; + + use super::*; + + fn write(positions: &[usize]) -> BitRank { + BitRank::from_iter(positions.iter().copied()) + } + + #[test] + fn test_rank_zero() { + let br = BitRank::from_iter([0]); + assert_eq!(br.rank(0), 0); + assert_eq!(br.rank(1), 1); + } + + #[test] + fn test_empty() { + let br = BitRank::from_iter([]); + assert!(br.blocks.is_empty()); + } + + #[test] + fn test_index_out_of_bounds() { + let br = BitRank::from_iter([BITS_PER_BLOCK - 1]); + assert_eq!(br.rank(BITS_PER_BLOCK), 1); + } + + #[test] + #[should_panic] + fn test_duplicate_position() { + write(&[64, 66, 68, 68, 90]); + } + + #[test] + fn test_rank_exclusive() { + let br = BitRank::from_iter(0..132); + assert_eq!(br.capacity(), BITS_PER_BLOCK); + assert_eq!(br.rank(64), 64); + assert_eq!(br.rank(132), 132); + } + + #[test] + fn test_rank() { + let mut positions: Vec = (0..132).collect(); + positions.append(&mut vec![138usize, 140, 146]); + let br = write(&positions); + assert_eq!(br.rank(135), 132); + + let bits2: Vec = (0..BITS_PER_BLOCK - 5).collect(); + let br2 = write(&bits2); + assert_eq!(br2.rank(169), 169); + + let bits3: Vec = (0..BITS_PER_BLOCK + 5).collect(); + let br3 = write(&bits3); + assert_eq!(br3.rank(BITS_PER_BLOCK), BITS_PER_BLOCK); + } + + #[test] + fn test_rank_idx() { + let mut positions: Vec = (0..132).collect(); + positions.append(&mut vec![138usize, 140, 146]); + let br = write(&positions); + assert_eq!(br.rank_select(135), (132, Some(131))); + + let bits2: Vec = (0..BITS_PER_BLOCK - 5).collect(); + let br2 = write(&bits2); + assert_eq!(br2.rank_select(169), (169, Some(168))); + + let bits3: Vec = (0..BITS_PER_BLOCK + 5).collect(); + let br3 = write(&bits3); + assert_eq!(br3.rank_select(BITS_PER_BLOCK), (BITS_PER_BLOCK, None)); + + let bits4: Vec = vec![1, 1000, 9999, BITS_PER_BLOCK + 1]; + let br4 = write(&bits4); + assert_eq!(br4.rank_select(10000), (3, Some(9999))); + + let bits5: Vec = vec![1, 1000, 9999, BITS_PER_BLOCK + 1]; + let br5 = write(&bits5); + assert_eq!(br5.rank_select(BITS_PER_BLOCK), (3, None)); + } + + #[test] + fn test_rank_large_random() { + let mut rng = ChaCha8Rng::seed_from_u64(2); + let uniform = Uniform::::from(0..1_000_000); + let mut random_bits = Vec::with_capacity(100_000); + for _ in 0..100_000 { + random_bits.push(uniform.sample(&mut rng)); + } + random_bits.sort_unstable(); + // This isn't strictly necessary, given that the bit would just be toggled again, but it + // ensures that we are meeting the contract. + random_bits.dedup(); + let br = write(&random_bits); + let mut rank = 0; + let mut select = None; + for i in 0..random_bits.capacity() { + if i % BITS_PER_CHUNK == 0 { + select = None; + } + assert_eq!(br.rank_select(i), (rank, select)); + if i == random_bits[rank] { + rank += 1; + select = Some(i); + } + } + } + + /// Test that we properly handle the case where the position is out of bounds for all + /// potentially tricky bit positions. 
+ #[test] + fn test_rank_out_of_bounds() { + for i in 1..30 { + let br = write(&[BITS_PER_BLOCK * i - 1]); + assert_eq!(br.max_rank(), 1); + assert_eq!(br.rank(BITS_PER_BLOCK * i - 1), 0); + for j in 0..10 { + assert_eq!(br.rank(BITS_PER_BLOCK * (i + j)), 1); + } + } + } + + #[test] + fn test_predecessor_and_successor() { + let mut rng = ChaCha8Rng::seed_from_u64(2); + let uniform = Uniform::::from(0..1_000_000); + let mut random_bits = Vec::with_capacity(100_000); + for _ in 0..100_000 { + random_bits.push(uniform.sample(&mut rng)); + } + random_bits.sort_unstable(); + random_bits.dedup(); + let br = write(&random_bits); + + for (i, j) in random_bits.iter().copied().tuple_windows() { + for k in i..j { + assert_eq!(br.successor(k + 1), j, "{i} {k} {j}"); + assert_eq!(br.predecessor(k), i, "{i} {k} {j}"); + } + } + } +} diff --git a/crates/utf8-converter/src/lib.rs b/crates/utf8-converter/src/lib.rs new file mode 100644 index 0000000..182583d --- /dev/null +++ b/crates/utf8-converter/src/lib.rs @@ -0,0 +1,577 @@ +//! Position calculator to convert between byte, char, and line positions. + +use std::ops::Range; + +mod bitrank; + +use bitrank::{BitRank, BitRankBuilder}; + +/// Position calculator to convert between byte, char, and line positions. +/// +/// Rust strings are UTF-8, but JavaScript has UTF-16 strings, while in Python, strings are +/// sequences of Unicode code points. It's therefore necessary to adjust string positions when +/// communicating across programming language boundaries. [`Utf8Converter`] does these adjustments. +/// +/// ## Converting offsets +/// +/// The conversion methods follow a naming scheme that uses these terms for different kinds of +/// offsets: +/// +/// - `utf8` - UTF-8 byte offsets (Rust style). +/// - `utf16` - UTF-16 code unit offsets (JavaScript style). +/// - `char` - Count of Unicode scalar values (Python style). +/// - `utf16_pos` - Zero-based line number and `utf16` offset within the line. +/// - `char_pos` - Zero-based line number and `char` offset within the line. +/// +/// For example, [`Utf8Converter::utf8_to_utf16`] converts a Rust byte offset to a number that will +/// index to the same position in a JavaScript string. Offsets are expressed as `u32` or [`Pos`] +/// values. +/// +/// All methods accept arguments that are off the end of the string (interpreting them as the end +/// of the string). +/// +/// ## Converting ranges +/// +/// Some methods translate position *ranges*. These are expressed as `Range` except for +/// `line`, which is a `u32`: +/// +/// - `line` - Zero-based line numbers. The range a `line` refers to is the whole line, including +/// the trailing newline character if any. +/// - `lines` - A range of line numbers. +/// - `utf8s` - UTF-8 byte ranges. +/// - `utf16s` - UTF-16 code unit ranges. +/// - `chars` - Ranges of Unicode scalar values. +/// +/// When mapping offsets to line ranges, it is important to use a `_to_lines` function in order to +/// end up with the correct line range. We have these methods because if you tried to do it +/// yourself you would screw it up; use them! (And see the source code for +/// [`Utf8Converter::utf8s_to_lines`] if you don't believe us.) +/// +/// ## Complexity +/// +/// Most operations run in O(1) time, some require O(log n) time. The memory consumed by this data +/// structure is typically less than the memory occupied by the actual content. In the best case, +/// it requires ~25% of the content space. 
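As a usage sketch of the conversions described above (the API names are the ones introduced in this patch; the example string and offsets are ours):

```rust
use utf8_converter::Utf8Converter;

fn main() {
    // "é" is 2 UTF-8 bytes but 1 UTF-16 code unit and 1 char.
    let text = "héllo\nwörld";
    let conv = Utf8Converter::new(text);

    let byte = text.find("llo").unwrap() as u32; // Rust-style byte offset 3
    assert_eq!(conv.utf8_to_char(byte), 2);      // Python-style offset
    assert_eq!(conv.utf8_to_utf16(byte), 2);     // JavaScript-style offset

    // Line/column conversion: the byte offset of 'w' is the start of line 1.
    let w = text.find('w').unwrap() as u32;
    let pos = conv.utf8_to_char_pos(w);
    assert_eq!((pos.line, pos.col), (1, 0));
}
```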
+pub struct Utf8Converter { + // Vector storing for every line the byte position at which the line starts. + line_begins: Vec, + + // Encoded bitrank where the rank of a byte position corresponds to the line number to which + // the byte belongs. + utf8_to_line: BitRank, + + // Encoded bitrank where the rank of a byte position corresponds to the char position to which + // the byte belongs. + utf8_to_char: BitRank, + + // Encoded bitrank where the rank of a byte position corresponds to the UTF-16 encoded word + // position to which the byte belongs. + utf8_to_utf16: BitRank, + + // Marks for every line whether it only consists of whitespace characters. + whitespace_only: Vec, +} + +/// A position in a string, specified by line and column number. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct Pos { + /// Zero-indexed line number. + pub line: u32, + /// Zero-indexed column number. The units of this field depend on the method that produces the + /// value. See [`Utf8Converter::utf8_to_char_pos`], [`Utf8Converter::utf8_to_utf16_pos`]. + pub col: u32, +} + +// The actual conversion implementation between utf8, utf16, chars, and line numbers. +// New methods must follow the existing conventions: +// +// - All conversions saturate when the input is out of bounds. +// - Lines INCLUDE the terminating newline. +// - Line numbers and column numbers are 0-based. +// - `.xyz_to_lines(range)` methods behave like `.utf8_to_lines(the corresponding byte range)`. +// +// This last one is tricky, because in these methods, `range.begin` "rounds down" to the beginning +// of the line, but `range.end` "rounds up"; and because there are many corner cases. +// +// E.g.: The empty character range at the end of one line cannot be distinguished from the empty +// character range at the end of the subsequent line! This ambiguity is resolved by returning the +// line which starts with the empty character range. +// +// Question: Consider whether we should return an empty line range in this case which would +// probably be consistent from a mathematical point of view. But then we should also return empty +// line ranges for empty character ranges in the middle of a line... +impl Utf8Converter { + /// Collects position information for the given string. + pub fn new(content: &str) -> Self { + new_converter(content.as_bytes()) + } + + /// Collects position information for a byte-string. + /// + /// If `content` is UTF-8, this is just like [`Utf8Converter::new`]. Otherwise, the + /// conversion methods involving characters will produce unspecified (but memory-safe) results. + pub fn from_bytes(content: &[u8]) -> Self { + new_converter(content) + } + + /// Returns the number of Unicode characters on the specified line. + pub fn line_chars(&self, line_number: u32) -> u32 { + let r = self.utf8s_to_chars(self.line_to_utf8s(line_number)); + r.end - r.start + } + + /// Returns the number of lines in the string. + pub fn lines(&self) -> u32 { + self.line_begins.len() as u32 - 1 + } + + pub fn only_whitespaces(&self, line_number: u32) -> bool { + self.whitespace_only + .get(line_number as usize) + .copied() + .unwrap_or(true) + } + + /// Return the byte offset of the first character on the specified (zero-based) line. + /// + /// If `line_number` is greater than the number of lines in the text, this returns the length + /// of the string. 
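A brief sketch of the clamping behaviour described here (example values are ours):

```rust
use utf8_converter::Utf8Converter;

fn main() {
    let conv = Utf8Converter::new("ab\ncd"); // two lines, 5 bytes
    assert_eq!(conv.lines(), 2);
    assert_eq!(conv.line_to_utf8_begin(0), 0);
    assert_eq!(conv.line_to_utf8_begin(1), 3);
    assert_eq!(conv.line_to_utf8_begin(99), 5); // past the last line: length of the string
}
```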
+ pub fn line_to_utf8_begin(&self, line_number: u32) -> u32 { + self.line_begins[line_number.min(self.lines()) as usize] + } + + /// Python-style offset of the first character of a line. + pub fn line_to_char_begin(&self, line_number: u32) -> u32 { + self.utf8_to_char(self.line_to_utf8_begin(line_number)) + } + + /// JS-style offset of the first character of a line. + pub fn line_to_utf16_begin(&self, line_number: u32) -> u32 { + self.utf8_to_utf16(self.line_to_utf8_begin(line_number)) + } + + /// Rust-style offset of the first character of a line. + pub fn line_to_utf8_end(&self, line_number: u32) -> u32 { + self.line_to_utf8_begin(line_number + 1) + } + + /// Python-style offset one past the end of a line (the offset of the start of the next line). + pub fn line_to_char_end(&self, line_number: u32) -> u32 { + self.utf8_to_char(self.line_to_utf8_end(line_number)) + } + + /// JS-style offset one past the end of a line (the offset of the start of the next line). + pub fn line_to_utf16_end(&self, line_number: u32) -> u32 { + self.utf8_to_utf16(self.line_to_utf8_end(line_number)) + } + + /// Rust-style offset one past the end of a line (the offset of the start of the next line). + pub fn line_to_utf8s(&self, line_number: u32) -> Range { + self.line_to_utf8_begin(line_number)..self.line_to_utf8_end(line_number) + } + + /// Python-style offsets for the beginning and end of a line, including the newline if any. + pub fn line_to_chars(&self, line_number: u32) -> Range { + self.utf8s_to_chars(self.line_to_utf8s(line_number)) + } + + /// Rust-style offsets for the beginning and end of a line, including the newline if any. + pub fn lines_to_utf8s(&self, line_numbers: Range) -> Range { + self.line_to_utf8_begin(line_numbers.start)..self.line_to_utf8_begin(line_numbers.end) + } + + /// Python-style offsets for the beginning and end of a range of lines, including the newline + /// of the last line, if any. + pub fn lines_to_chars(&self, line_numbers: Range) -> Range { + self.utf8s_to_chars(self.lines_to_utf8s(line_numbers)) + } + + /// Return the range of line numbers containing the substring specified by the Python-style + /// range `chars`. Newline characters count as part of the preceding line. + pub fn chars_to_lines(&self, chars: Range) -> Range { + self.utf8s_to_lines(self.chars_to_utf8s(chars)) + } + + /// Return the zero-based line number of the line containing the specified Rust-style offset. + /// Newline characters count as part of the preceding line. + pub fn utf8_to_line(&self, byte_number: u32) -> u32 { + self.utf8_to_line.rank(byte_number as usize) as u32 + } + + /// Converts a Rust-style offset to a zero-based line number and Python-style offset within the + /// line. + pub fn utf8_to_char_pos(&self, byte_number: u32) -> Pos { + let line = self.utf8_to_line(byte_number); + let line_start_char_number = self.line_to_char_begin(line); + let char_idx = self.utf8_to_char(byte_number); + Pos { + line, + col: char_idx - line_start_char_number, + } + } + + /// Converts a Rust-style offset to a zero-based line number and JS-style offset within the + /// line. + pub fn utf8_to_utf16_pos(&self, byte_number: u32) -> Pos { + let line = self.utf8_to_line(byte_number); + let line_start_char_number = self.line_to_utf16_begin(line); + let char_idx = self.utf8_to_utf16(byte_number); + Pos { + line, + col: char_idx - line_start_char_number, + } + } + + /// Returns the range of line numbers containing the substring specified by the Rust-style + /// range `bytes`. 
Newline characters count as part of the preceding line. + /// + /// If `bytes` is an empty range at a position within or at the beginning of a line, this + /// returns a nonempty range containing the line number of that one line. An empty range at or + /// beyond the end of the string translates to an empty range of line numbers. + pub fn utf8s_to_lines(&self, bytes: Range) -> Range { + // The fiddly parts of this formula are necessary because `bytes.start` rounds down to the + // beginning of the line, but `bytes.end` "rounds up" to the end of the line. the final + // `+1` is to produce a half-open range. + self.utf8_to_line(bytes.start) + ..self + .lines() + .min(self.utf8_to_line(bytes.end.saturating_sub(1).max(bytes.start)) + 1) + } + + /// Converts a Rust-style offset to Python style. + pub fn utf8_to_char(&self, byte_number: u32) -> u32 { + self.utf8_to_char.rank(byte_number as usize) as u32 + } + + /// Converts a Rust-style offset to JS style. + pub fn utf8_to_utf16(&self, byte_number: u32) -> u32 { + self.utf8_to_utf16.rank(byte_number as usize) as u32 + } + + /// Converts a Python-style offset to Rust style. + pub fn char_to_utf8(&self, char_number: u32) -> u32 { + let mut byte_number = char_number; + for _ in 0..128 { + let char_number2 = self.utf8_to_char(byte_number); + if char_number2 == char_number { + return byte_number; + } + byte_number += char_number - char_number2; + } + // If we couldn't find the char within 128 steps, then the char_number might be invalid! + // This does not usually happen. For consistency with the rest of the code, we simply return + // the max utf8 position in this case. + if char_number > self.utf8_to_char.max_rank() as u32 { + return self + .line_begins + .last() + .copied() + .expect("last entry represents the length of the file!"); + } + let limit = *self.line_begins.last().expect("no line begins"); + // Otherwise, we keep searching, but are a bit more careful and add a check that we don't run into an infinite loop. + loop { + let char_number2 = self.utf8_to_char(byte_number); + if char_number2 == char_number { + return byte_number; + } + byte_number += char_number - char_number2; + assert!(byte_number < limit); + } + } + + /// Converts a Rust-style offset range to Python style. + pub fn utf8s_to_chars(&self, bytes: Range) -> Range { + self.utf8_to_char(bytes.start)..self.utf8_to_char(bytes.end) + } + + /// Converts a Python-style offset range to Rust style. + pub fn chars_to_utf8s(&self, chars: Range) -> Range { + self.char_to_utf8(chars.start)..self.char_to_utf8(chars.end) + } +} + +fn new_converter(content: &[u8]) -> Utf8Converter { + let mut utf8_builder = BitRankBuilder::new(); + let mut utf16_builder = BitRankBuilder::new(); + let mut line_builder = BitRankBuilder::new(); + let mut line_begins = vec![0]; + let mut i = 0; + let mut whitespace_only = vec![]; + let mut only_whitespaces = true; // true if all characters in the current line are whitespaces. + while i < content.len() { + // In case of invalid utf8, we might get a utf8_len of 0. + // In this case, we just treat the single byte character. + // In principle, a single incorrect byte can break the whole decoding... 
+ let c = content[i]; + let utf8_len = utf8_width(c).max(1); + if i > 0 { + utf8_builder.push(i - 1); + utf16_builder.push(i - 1); + } + if utf8_to_utf16_width(&content[i..]) > 1 { + utf16_builder.push(i); + } + if c == b'\n' { + whitespace_only.push(only_whitespaces); + line_begins.push(i as u32 + 1); + line_builder.push(i); + only_whitespaces = true; // reset for next line. + } else { + only_whitespaces &= matches!(c, b'\t' | b'\r' | b' '); + } + i += utf8_len; + } + if !content.is_empty() { + utf8_builder.push(content.len() - 1); + utf16_builder.push(content.len() - 1); + } + if line_begins.last() != Some(&(content.len() as u32)) { + whitespace_only.push(only_whitespaces); + line_begins.push(content.len() as u32); + line_builder.push(content.len() - 1); + } + + Utf8Converter { + line_begins, + utf8_to_line: line_builder.finish(), + whitespace_only, + utf8_to_char: utf8_builder.finish(), + utf8_to_utf16: utf16_builder.finish(), + } +} + +/// Returns true if, in a UTF-8 string, `b` always indicates the first byte of a character. +/// +/// (This is true for bytes `0..=127` and `192..=255`.) +pub fn is_char_boundary(b: u8) -> bool { + // Single byte encodings satisfy the bit pattern 0xxxxxxx, i.e. b < 128 + // Continuation bytes satisfy the bit pattern 10xxxxxx, i.e. b < 192 + // The rest are bytes belonging to the first byte of multi byte encodings (11xxxxxx): b >= 192 + // + // When interpreting the byte representation as signed integers, then numbers in the range + // 128..192 correspond to the smallest representable numbers. I.e. the two ranges [0, 128) and + // [192, 256) can be tested with a single signed comparison. + b as i8 >= -0x40 // NB: b < 128 || b >= 192 +} + +/// Returns the number of bytes this utf8 char occupies given the first byte of the utf8 encoding. +/// Returns 0 if the byte is not a valid first byte of a utf8 char. +fn utf8_width(c: u8) -> usize { + // Every nibble represents the utf8 length given the first 4 bits of a utf8 encoded byte. + const UTF8_WIDTH: usize = 0x4322_0000_1111_1111; + (UTF8_WIDTH >> ((c >> 4) * 4)) & 0xf +} + +fn utf8_to_utf16_width(content: &[u8]) -> usize { + let len = utf8_width(content[0]); + match len { + 0 => 0, + 1..=3 => 1, + 4 => 2, + _ => panic!("invalid utf8 char width: {}", len), + } +} + +#[cfg(test)] +mod test { + use super::is_char_boundary; + use crate::{utf8_to_utf16_width, utf8_width, Pos, Utf8Converter}; + + #[test] + fn test_utf8_char_width() { + for c in '\0'..=char::MAX { + let mut dst = [0; 4]; + let len = c.encode_utf8(&mut dst).len(); + assert_eq!(len, utf8_width(dst[0]), "char: {:?} {len}", dst[0] >> 4); + } + + for b in 0..=255u8 { + if !is_char_boundary(b) { + assert_eq!(utf8_width(b), 0, "char: {:?}", b >> 4); + } else { + assert!(utf8_width(b) > 0, "char: {:?}", b >> 4); + } + } + } + + #[test] + fn test_utf8_to_utf16_len() { + for c in '\0'..=char::MAX { + let mut dst = [0; 4]; + let _len = c.encode_utf8(&mut dst).len(); + assert_eq!(utf8_to_utf16_width(&dst), c.len_utf16()); + } + + for b in 0..=255u8 { + if !is_char_boundary(b) { + assert_eq!(utf8_to_utf16_width(&[b]), 0); + } + } + } + + #[test] + fn test_line_map() { + let content = r#"a short line. +followed by another one. 
+no terminating newline!"#; + let lines = Utf8Converter::new(content); + assert_eq!(lines.line_to_utf8s(0), 0..14); + assert_eq!(&content[0..14], "a short line.\n"); + assert_eq!(lines.line_to_utf8s(1), 14..39); + assert_eq!(&content[14..39], "followed by another one.\n"); + assert_eq!(lines.line_to_utf8s(2), 39..62); + assert_eq!(&content[39..62], "no terminating newline!"); + assert_eq!(lines.utf8_to_line(0), 0); + assert_eq!(lines.utf8_to_line(13), 0); + assert_eq!(lines.utf8_to_line(14), 1); + assert_eq!(lines.utf8_to_line(38), 1); + assert_eq!(lines.utf8_to_line(39), 2); + assert_eq!(lines.utf8_to_line(61), 2); + assert_eq!(lines.utf8_to_line(62), 3); // <<-- this character is beyond the content. + assert_eq!(lines.utf8_to_line(100), 3); + assert_eq!(lines.utf8s_to_chars(4..10), 4..10); + assert_eq!(lines.chars_to_utf8s(4..10), 4..10); + + assert_eq!(content.len(), 62); + assert_eq!(lines.lines_to_utf8s(2..3), 39..62); + assert_eq!(lines.lines_to_utf8s(2..4), 39..62); + assert_eq!(lines.lines_to_chars(2..4), 39..62); + assert_eq!(lines.utf8s_to_lines(39..62), 2..3); + assert_eq!(lines.utf8s_to_lines(39..63), 2..3); // The "invalid" utf8 position results in a valid line position. + assert_eq!(lines.char_to_utf8(62), 62); + assert_eq!(lines.char_to_utf8(63), 62); // char 63 doesn't exist, so we map to the closest valid utf8 position. + + // Empty ranges + assert_eq!(lines.utf8s_to_lines(0..0), 0..1); + assert_eq!(lines.utf8s_to_lines(13..13), 0..1); + assert_eq!(lines.utf8s_to_lines(14..14), 1..2); + assert_eq!(lines.utf8s_to_lines(38..38), 1..2); + assert_eq!(lines.utf8s_to_lines(39..39), 2..3); + assert_eq!(lines.utf8s_to_lines(61..61), 2..3); + assert_eq!(lines.utf8s_to_lines(62..62), 3..3); + assert_eq!(lines.utf8s_to_lines(63..63), 3..3); + } + + fn pos(line: u32, col: u32) -> Pos { + Pos { line, col } + } + + #[test] + fn test_convert_ascii() { + let content = r#"line0 +line1"#; + let lines = Utf8Converter::new(content); + assert_eq!(lines.utf8_to_char_pos(0), pos(0, 0)); + assert_eq!(lines.utf8_to_char_pos(1), pos(0, 1)); + assert_eq!(lines.utf8_to_char_pos(6), pos(1, 0)); + assert_eq!(lines.utf8_to_char_pos(7), pos(1, 1)); + } + + #[test] + fn test_convert_unicode() { + // Á - 2 bytes utf8 + let content = r#"❤️ line0 +line1 +✅ line2"#; + let lines = Utf8Converter::new(content); + assert_eq!(lines.utf8_to_char_pos(0), pos(0, 0)); // ❤️ takes 6 bytes to represent in utf8 (2 code points) + assert_eq!(lines.utf8_to_char_pos(1), pos(0, 0)); + assert_eq!(lines.utf8_to_char_pos(2), pos(0, 0)); + assert_eq!(lines.utf8_to_char_pos(3), pos(0, 1)); + assert_eq!(lines.utf8_to_char_pos(4), pos(0, 1)); + assert_eq!(lines.utf8_to_char_pos(5), pos(0, 1)); + + assert_eq!(lines.utf8_to_char_pos(6), pos(0, 2)); // + assert_eq!(lines.utf8_to_char_pos(7), pos(0, 3)); // line + // ^ + + assert_eq!(lines.utf8_to_char_pos(13), pos(1, 0)); // line + // ^ + + assert_eq!(lines.utf8_to_char_pos(19), pos(2, 0)); // ✅ takes 3 bytes to represent in utf8 (1 code point) + assert_eq!(lines.utf8_to_char_pos(20), pos(2, 0)); + assert_eq!(lines.utf8_to_char_pos(21), pos(2, 0)); + + assert_eq!(lines.utf8_to_char_pos(22), pos(2, 1)); // + + assert_eq!(lines.utf8_to_utf16_pos(0), pos(0, 0)); // ❤️ takes 4 bytes to represent in utf16 (2 code points) + assert_eq!(lines.utf8_to_utf16_pos(1), pos(0, 0)); + assert_eq!(lines.utf8_to_utf16_pos(2), pos(0, 0)); + assert_eq!(lines.utf8_to_utf16_pos(3), pos(0, 1)); + } + + #[test] + fn test_small() { + // Á - 2 bytes utf8 + let content = r#"❤️ line0 ❤️Á 👋"#; + let lines = 
Utf8Converter::new(content); + let mut utf16_index = 0; + let mut char_index = 0; + for (byte_index, char) in content.char_indices() { + assert_eq!(lines.utf8_to_char(byte_index as u32), char_index); + assert_eq!(lines.utf8_to_utf16(byte_index as u32), utf16_index); + char_index += 1; + utf16_index += char.len_utf16() as u32; + } + assert_eq!(lines.utf8_to_char(content.len() as u32), char_index); + assert_eq!(lines.utf8_to_utf16(content.len() as u32), utf16_index); + } + + #[test] + fn test_variable_lengths() { + let content = r#"❤️Á 👋"#; + // ^~ utf8: 1 char, 4 bytes, utf16: 2 code units + // ^~~~ utf8: 1 char, 1 byte, utf16: 1 code unit + // ^~~~~ utf8: 1 char, 2 bytes, utf16: 1 code unit + // ^~~~~~ utf8: 2 chars, 3 byte ea., utf16: 2 code units + let lines = Utf8Converter::new(content); + + // UTF-16 positions + assert_eq!(lines.utf8_to_utf16_pos(0), pos(0, 0)); // ❤️ + assert_eq!(lines.utf8_to_utf16_pos(1), pos(0, 0)); + assert_eq!(lines.utf8_to_utf16_pos(2), pos(0, 0)); + assert_eq!(lines.utf8_to_utf16_pos(3), pos(0, 1)); + assert_eq!(lines.utf8_to_utf16_pos(5), pos(0, 1)); + assert_eq!(lines.utf8_to_utf16_pos(4), pos(0, 1)); + assert_eq!(lines.utf8_to_utf16_pos(6), pos(0, 2)); // Á + assert_eq!(lines.utf8_to_utf16_pos(7), pos(0, 2)); + assert_eq!(lines.utf8_to_utf16_pos(8), pos(0, 3)); // + assert_eq!(lines.utf8_to_utf16_pos(9), pos(0, 4)); // 👋 + + // These middle utf8 byte positions don't have valid mappings: + // assert_eq!(lines.utf8_to_utf16_pos(10), pos(0, 4)); + // assert_eq!(lines.utf8_to_utf16_pos(11), pos(0, 5)); + // + // 👋 in utf16: 0xd83d 0xdc4b + // 👋 in utf8: 0xf0 0x9f 0x91 0x8b + // ^ ^ + // It's not really defined where these inner bytes map to and it + // doesn't matter because we would never report those byte offset as + // they are in the middle of a character and therefore invalid. + + assert_eq!(lines.utf8_to_utf16_pos(12), pos(0, 5)); + + // UTF-8 positions + assert_eq!(lines.utf8_to_char_pos(0), pos(0, 0)); // ❤️ + assert_eq!(lines.utf8_to_char_pos(1), pos(0, 0)); + assert_eq!(lines.utf8_to_char_pos(2), pos(0, 0)); + assert_eq!(lines.utf8_to_char_pos(3), pos(0, 1)); + assert_eq!(lines.utf8_to_char_pos(4), pos(0, 1)); + assert_eq!(lines.utf8_to_char_pos(5), pos(0, 1)); + assert_eq!(lines.utf8_to_char_pos(6), pos(0, 2)); // Á + assert_eq!(lines.utf8_to_char_pos(7), pos(0, 2)); + assert_eq!(lines.utf8_to_char_pos(8), pos(0, 3)); // + assert_eq!(lines.utf8_to_char_pos(9), pos(0, 4)); // 👋 + assert_eq!(lines.utf8_to_char_pos(10), pos(0, 4)); + assert_eq!(lines.utf8_to_char_pos(11), pos(0, 4)); + assert_eq!(lines.utf8_to_char_pos(12), pos(0, 4)); + } + + #[test] + fn test_critical_input_len() { + let content = [b'a'; 16384]; + let lines = Utf8Converter::from_bytes(&content); + assert_eq!(lines.utf8_to_utf16_pos(16384), pos(1, 0)); + } +} From d913518b7526351ccbbff409e0b8649aa4eb6959 Mon Sep 17 00:00:00 2001 From: Jason Orendorff Date: Fri, 1 Nov 2024 17:22:23 -0500 Subject: [PATCH 02/11] utf8-converter: Require docs for public items. --- crates/utf8-converter/src/lib.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/crates/utf8-converter/src/lib.rs b/crates/utf8-converter/src/lib.rs index 182583d..4b849f7 100644 --- a/crates/utf8-converter/src/lib.rs +++ b/crates/utf8-converter/src/lib.rs @@ -1,4 +1,5 @@ //! Position calculator to convert between byte, char, and line positions. 
+#![deny(missing_docs)] use std::ops::Range; @@ -125,6 +126,7 @@ impl Utf8Converter { self.line_begins.len() as u32 - 1 } + /// Returns true if the specified line is empty except for whitespace. pub fn only_whitespaces(&self, line_number: u32) -> bool { self.whitespace_only .get(line_number as usize) From 91ef54d6c9ea3dcc99a7dfb03915604f850fdc08 Mon Sep 17 00:00:00 2001 From: Jason Orendorff Date: Fri, 8 Nov 2024 16:52:21 -0600 Subject: [PATCH 03/11] rename --- .../Cargo.toml | 2 +- .../README.md | 0 .../src/bitrank.rs | 0 .../src/lib.rs | 32 +++++++++---------- 4 files changed, 17 insertions(+), 17 deletions(-) rename crates/{utf8-converter => string-offsets}/Cargo.toml (87%) rename crates/{utf8-converter => string-offsets}/README.md (100%) rename crates/{utf8-converter => string-offsets}/src/bitrank.rs (100%) rename crates/{utf8-converter => string-offsets}/src/lib.rs (96%) diff --git a/crates/utf8-converter/Cargo.toml b/crates/string-offsets/Cargo.toml similarity index 87% rename from crates/utf8-converter/Cargo.toml rename to crates/string-offsets/Cargo.toml index 49da760..5bb425f 100644 --- a/crates/utf8-converter/Cargo.toml +++ b/crates/string-offsets/Cargo.toml @@ -1,7 +1,7 @@ [package] authors = ["The blackbird team "] edition = "2021" -name = "utf8-converter" +name = "string-offests" version = "0.1.0" [dependencies] diff --git a/crates/utf8-converter/README.md b/crates/string-offsets/README.md similarity index 100% rename from crates/utf8-converter/README.md rename to crates/string-offsets/README.md diff --git a/crates/utf8-converter/src/bitrank.rs b/crates/string-offsets/src/bitrank.rs similarity index 100% rename from crates/utf8-converter/src/bitrank.rs rename to crates/string-offsets/src/bitrank.rs diff --git a/crates/utf8-converter/src/lib.rs b/crates/string-offsets/src/lib.rs similarity index 96% rename from crates/utf8-converter/src/lib.rs rename to crates/string-offsets/src/lib.rs index 4b849f7..70611fb 100644 --- a/crates/utf8-converter/src/lib.rs +++ b/crates/string-offsets/src/lib.rs @@ -11,7 +11,7 @@ use bitrank::{BitRank, BitRankBuilder}; /// /// Rust strings are UTF-8, but JavaScript has UTF-16 strings, while in Python, strings are /// sequences of Unicode code points. It's therefore necessary to adjust string positions when -/// communicating across programming language boundaries. [`Utf8Converter`] does these adjustments. +/// communicating across programming language boundaries. [`StringOffsets`] does these adjustments. /// /// ## Converting offsets /// @@ -24,7 +24,7 @@ use bitrank::{BitRank, BitRankBuilder}; /// - `utf16_pos` - Zero-based line number and `utf16` offset within the line. /// - `char_pos` - Zero-based line number and `char` offset within the line. /// -/// For example, [`Utf8Converter::utf8_to_utf16`] converts a Rust byte offset to a number that will +/// For example, [`StringOffsets::utf8_to_utf16`] converts a Rust byte offset to a number that will /// index to the same position in a JavaScript string. Offsets are expressed as `u32` or [`Pos`] /// values. /// @@ -46,14 +46,14 @@ use bitrank::{BitRank, BitRankBuilder}; /// When mapping offsets to line ranges, it is important to use a `_to_lines` function in order to /// end up with the correct line range. We have these methods because if you tried to do it /// yourself you would screw it up; use them! (And see the source code for -/// [`Utf8Converter::utf8s_to_lines`] if you don't believe us.) +/// [`StringOffsets::utf8s_to_lines`] if you don't believe us.) 
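To make the warning above concrete, here is a hedged sketch (example text and offsets are ours; the import assumes the package-name spelling fixed in the next commit):

```rust
use string_offsets::StringOffsets;

fn main() {
    let text = "AAA\nBBB\nCCC";
    let offsets = StringOffsets::new(text);

    // The byte range covering "A\nB" touches lines 0 and 1.
    assert_eq!(offsets.utf8s_to_lines(2..5), 0..2);
    // An empty range at a line boundary maps to the line that starts there.
    assert_eq!(offsets.utf8s_to_lines(4..4), 1..2);
    // Naively converting each endpoint with `utf8_to_line` would yield 0..1 here,
    // silently dropping line 1.
    assert_eq!(offsets.utf8_to_line(2)..offsets.utf8_to_line(5), 0..1);
}
```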
/// /// ## Complexity /// /// Most operations run in O(1) time, some require O(log n) time. The memory consumed by this data /// structure is typically less than the memory occupied by the actual content. In the best case, /// it requires ~25% of the content space. -pub struct Utf8Converter { +pub struct StringOffsets { // Vector storing for every line the byte position at which the line starts. line_begins: Vec, @@ -79,7 +79,7 @@ pub struct Pos { /// Zero-indexed line number. pub line: u32, /// Zero-indexed column number. The units of this field depend on the method that produces the - /// value. See [`Utf8Converter::utf8_to_char_pos`], [`Utf8Converter::utf8_to_utf16_pos`]. + /// value. See [`StringOffsets::utf8_to_char_pos`], [`StringOffsets::utf8_to_utf16_pos`]. pub col: u32, } @@ -101,7 +101,7 @@ pub struct Pos { // Question: Consider whether we should return an empty line range in this case which would // probably be consistent from a mathematical point of view. But then we should also return empty // line ranges for empty character ranges in the middle of a line... -impl Utf8Converter { +impl StringOffsets { /// Collects position information for the given string. pub fn new(content: &str) -> Self { new_converter(content.as_bytes()) @@ -109,7 +109,7 @@ impl Utf8Converter { /// Collects position information for a byte-string. /// - /// If `content` is UTF-8, this is just like [`Utf8Converter::new`]. Otherwise, the + /// If `content` is UTF-8, this is just like [`StringOffsets::new`]. Otherwise, the /// conversion methods involving characters will produce unspecified (but memory-safe) results. pub fn from_bytes(content: &[u8]) -> Self { new_converter(content) @@ -293,7 +293,7 @@ impl Utf8Converter { } } -fn new_converter(content: &[u8]) -> Utf8Converter { +fn new_converter(content: &[u8]) -> StringOffsets { let mut utf8_builder = BitRankBuilder::new(); let mut utf16_builder = BitRankBuilder::new(); let mut line_builder = BitRankBuilder::new(); @@ -334,7 +334,7 @@ fn new_converter(content: &[u8]) -> Utf8Converter { line_builder.push(content.len() - 1); } - Utf8Converter { + StringOffsets { line_begins, utf8_to_line: line_builder.finish(), whitespace_only, @@ -378,7 +378,7 @@ fn utf8_to_utf16_width(content: &[u8]) -> usize { #[cfg(test)] mod test { use super::is_char_boundary; - use crate::{utf8_to_utf16_width, utf8_width, Pos, Utf8Converter}; + use crate::{utf8_to_utf16_width, utf8_width, Pos, StringOffsets}; #[test] fn test_utf8_char_width() { @@ -417,7 +417,7 @@ mod test { let content = r#"a short line. followed by another one. 
no terminating newline!"#; - let lines = Utf8Converter::new(content); + let lines = StringOffsets::new(content); assert_eq!(lines.line_to_utf8s(0), 0..14); assert_eq!(&content[0..14], "a short line.\n"); assert_eq!(lines.line_to_utf8s(1), 14..39); @@ -463,7 +463,7 @@ no terminating newline!"#; fn test_convert_ascii() { let content = r#"line0 line1"#; - let lines = Utf8Converter::new(content); + let lines = StringOffsets::new(content); assert_eq!(lines.utf8_to_char_pos(0), pos(0, 0)); assert_eq!(lines.utf8_to_char_pos(1), pos(0, 1)); assert_eq!(lines.utf8_to_char_pos(6), pos(1, 0)); @@ -476,7 +476,7 @@ line1"#; let content = r#"❤️ line0 line1 ✅ line2"#; - let lines = Utf8Converter::new(content); + let lines = StringOffsets::new(content); assert_eq!(lines.utf8_to_char_pos(0), pos(0, 0)); // ❤️ takes 6 bytes to represent in utf8 (2 code points) assert_eq!(lines.utf8_to_char_pos(1), pos(0, 0)); assert_eq!(lines.utf8_to_char_pos(2), pos(0, 0)); @@ -507,7 +507,7 @@ line1 fn test_small() { // Á - 2 bytes utf8 let content = r#"❤️ line0 ❤️Á 👋"#; - let lines = Utf8Converter::new(content); + let lines = StringOffsets::new(content); let mut utf16_index = 0; let mut char_index = 0; for (byte_index, char) in content.char_indices() { @@ -527,7 +527,7 @@ line1 // ^~~~ utf8: 1 char, 1 byte, utf16: 1 code unit // ^~~~~ utf8: 1 char, 2 bytes, utf16: 1 code unit // ^~~~~~ utf8: 2 chars, 3 byte ea., utf16: 2 code units - let lines = Utf8Converter::new(content); + let lines = StringOffsets::new(content); // UTF-16 positions assert_eq!(lines.utf8_to_utf16_pos(0), pos(0, 0)); // ❤️ @@ -573,7 +573,7 @@ line1 #[test] fn test_critical_input_len() { let content = [b'a'; 16384]; - let lines = Utf8Converter::from_bytes(&content); + let lines = StringOffsets::from_bytes(&content); assert_eq!(lines.utf8_to_utf16_pos(16384), pos(1, 0)); } } From 4683f6dcdf23e8625d52d90d504014d4acc476b6 Mon Sep 17 00:00:00 2001 From: Jason Orendorff Date: Fri, 8 Nov 2024 17:01:50 -0600 Subject: [PATCH 04/11] Update with the latest from upstream --- crates/string-offsets/Cargo.toml | 4 +- crates/string-offsets/src/bitrank.rs | 159 +++++++++--------- crates/string-offsets/src/lib.rs | 237 ++++++++++++++------------- 3 files changed, 200 insertions(+), 200 deletions(-) diff --git a/crates/string-offsets/Cargo.toml b/crates/string-offsets/Cargo.toml index 5bb425f..842b7f7 100644 --- a/crates/string-offsets/Cargo.toml +++ b/crates/string-offsets/Cargo.toml @@ -1,10 +1,10 @@ [package] authors = ["The blackbird team "] edition = "2021" -name = "string-offests" +name = "string-offsets" version = "0.1.0" -[dependencies] +[dev-dependencies] itertools = "0.13" rand = "0.8" rand_chacha = "0.3" diff --git a/crates/string-offsets/src/bitrank.rs b/crates/string-offsets/src/bitrank.rs index fc2fa87..88c77cb 100644 --- a/crates/string-offsets/src/bitrank.rs +++ b/crates/string-offsets/src/bitrank.rs @@ -3,15 +3,15 @@ //! //! There is also an opportunistic `select` operation, but the general case has not been //! implemented. +//! +//! See also: ["Succinct data structure"](https://en.wikipedia.org/wiki/Succinct_data_structure). -type Chunk = u128; +type SubblockBits = u128; // Static sizing of the various components of the data structure. 
const BITS_PER_BLOCK: usize = 16384; -const BITS_PER_SUB_BLOCK: usize = 128; +const BITS_PER_SUB_BLOCK: usize = SubblockBits::BITS as usize; const SUB_BLOCKS_PER_BLOCK: usize = BITS_PER_BLOCK / BITS_PER_SUB_BLOCK; -const BITS_PER_CHUNK: usize = 128; -const CHUNKS_PER_SUB_BLOCK: usize = BITS_PER_SUB_BLOCK / BITS_PER_CHUNK; /// A container for a portion of the total bit vector and the associated indices. /// The bits within each chunk are stored from most significant bit (msb) to least significant bit (lsb). @@ -30,7 +30,6 @@ const CHUNKS_PER_SUB_BLOCK: usize = BITS_PER_SUB_BLOCK / BITS_PER_CHUNK; /// sub-block rank: [ 0 ][ 2 ] /// ``` #[derive(Clone, Debug)] -#[repr(C)] struct Block { /// Rank of the first bit in this block (that is, the number of bits set in previous blocks). rank: u64, @@ -39,38 +38,29 @@ struct Block { /// sub-blocks `0..i`. `sub_blocks[0]` is always zero. sub_blocks: [u16; SUB_BLOCKS_PER_BLOCK], /// The bit-vector. - bits: [Chunk; BITS_PER_BLOCK / BITS_PER_CHUNK], + bits: [SubblockBits; SUB_BLOCKS_PER_BLOCK], } impl Block { - fn new(rank: u64) -> Self { - Self { - rank, - sub_blocks: [0; SUB_BLOCKS_PER_BLOCK], - bits: [0; BITS_PER_BLOCK / BITS_PER_CHUNK], - } - } - /// Set a bit without updating `self.sub_blocks`. /// /// This panics if the bit was already set, because that indicates that the original positions /// list is invalid/had duplicates. fn set(&mut self, index: usize) { assert!(index < BITS_PER_BLOCK); - let chunk_idx = index / BITS_PER_CHUNK; - let bit_idx = index % BITS_PER_CHUNK; - let mask = 1 << ((BITS_PER_CHUNK - 1) - bit_idx); + let chunk_idx = index / BITS_PER_SUB_BLOCK; + let bit_idx = index % BITS_PER_SUB_BLOCK; + let mask = 1 << ((BITS_PER_SUB_BLOCK - 1) - bit_idx); assert_eq!(self.bits[chunk_idx] & mask, 0, "toggling bits off indicates that the original data was incorrect, most likely containing duplicate values."); self.bits[chunk_idx] ^= mask; } /// Tests whether the bit at the given index is set. - #[allow(dead_code)] fn get(&self, index: usize) -> bool { assert!(index < BITS_PER_BLOCK); - let chunk_idx = index / BITS_PER_CHUNK; - let bit_idx = index % BITS_PER_CHUNK; - let mask = 1 << ((BITS_PER_CHUNK - 1) - bit_idx); + let chunk_idx = index / BITS_PER_SUB_BLOCK; + let bit_idx = index % BITS_PER_SUB_BLOCK; + let mask = 1 << ((BITS_PER_SUB_BLOCK - 1) - bit_idx); self.bits[chunk_idx] & mask != 0 } @@ -84,19 +74,13 @@ impl Block { let sub_block = local_idx / BITS_PER_SUB_BLOCK; rank += self.sub_blocks[sub_block] as usize; - if BITS_PER_CHUNK != BITS_PER_SUB_BLOCK { - for i in sub_block * CHUNKS_PER_SUB_BLOCK..local_idx / BITS_PER_CHUNK { - rank += self.bits[i].count_ones() as usize; - } - } - - let remainder = local_idx % BITS_PER_CHUNK; + let remainder = local_idx % BITS_PER_SUB_BLOCK; - let last_chunk = local_idx / BITS_PER_CHUNK; + let last_chunk = local_idx / BITS_PER_SUB_BLOCK; let masked = if remainder == 0 { 0 } else { - self.bits[last_chunk] >> (BITS_PER_CHUNK - remainder) + self.bits[last_chunk] >> (BITS_PER_SUB_BLOCK - remainder) }; rank += masked.count_ones() as usize; let select = if masked == 0 { @@ -110,7 +94,7 @@ impl Block { fn total_rank(&self) -> usize { self.sub_blocks[SUB_BLOCKS_PER_BLOCK - 1] as usize + self.rank as usize - + self.bits[(SUB_BLOCKS_PER_BLOCK - 1) * CHUNKS_PER_SUB_BLOCK..] + + self.bits[SUB_BLOCKS_PER_BLOCK - 1..] 
.iter() .map(|c| c.count_ones() as usize) .sum::() @@ -151,24 +135,11 @@ impl Block { } } -impl Default for Block { - fn default() -> Self { - Block { - rank: 0, - sub_blocks: [0u16; SUB_BLOCKS_PER_BLOCK], - bits: [0; BITS_PER_BLOCK / BITS_PER_CHUNK], - } - } -} - /// Builder for creating a [`BitRank`]. /// /// # Examples /// /// ```text -/// // Note: This should work as a doctest, except this module is not public. -/// let mut bytes = Vec::::new(); -/// /// let mut builder = BitRankBuilder::new(); /// builder.push(17); /// builder.push(23); @@ -179,9 +150,6 @@ impl Default for Block { #[derive(Default)] pub struct BitRankBuilder { blocks: Vec, - curr_rank: u64, - curr_block_id: usize, - curr_block: Option, } impl BitRankBuilder { @@ -190,55 +158,56 @@ impl BitRankBuilder { Self::default() } - fn push_block(&mut self, mut block: Block) -> u64 { - let mut local_rank = 0; - for (i, chunk) in block.bits.iter().enumerate() { - // If the settings are ever changed, CHUNKS_PER_SUB_BLOCK will likely no longer be 1, so - // you will need this modulo. - #[expect(clippy::modulo_one)] - if i % CHUNKS_PER_SUB_BLOCK == 0 { - block.sub_blocks[i / CHUNKS_PER_SUB_BLOCK] = local_rank; + /// Returns a builder that can hold integers with values `0..cap`. + pub fn with_capacity(cap: usize) -> Self { + Self { + blocks: Vec::with_capacity(cap.div_ceil(BITS_PER_BLOCK)), + } + } + + fn finish_last_block(&mut self) -> u64 { + if let Some(block) = self.blocks.last_mut() { + let mut local_rank = 0; + for (i, chunk) in block.bits.iter().enumerate() { + block.sub_blocks[i] = local_rank; + local_rank += chunk.count_ones() as u16; } - local_rank += chunk.count_ones() as u16; + block.rank + local_rank as u64 + } else { + 0 } - let end_rank = block.rank + local_rank as u64; - self.blocks.push(block); - end_rank } /// Adds a bit. Bits must be added in order of increasing `position`. pub fn push(&mut self, position: usize) { let block_id = position / BITS_PER_BLOCK; assert!( - self.curr_block_id <= block_id, + self.blocks.len() <= block_id + 1, "positions must be increasing!" ); - while block_id > self.curr_block_id { - let curr_block = self - .curr_block - .take() - .unwrap_or_else(|| Block::new(self.curr_rank)); - let end_rank = self.push_block(curr_block); - self.curr_rank = end_rank; - self.curr_block_id += 1; - } - match &mut self.curr_block { - None => { - let mut block = Block::new(self.curr_rank); - block.set(position % BITS_PER_BLOCK); - self.curr_block = Some(block); - } - Some(block) => { - block.set(position % BITS_PER_BLOCK); + if block_id >= self.blocks.len() { + let curr_rank = self.finish_last_block(); + while block_id >= self.blocks.len() { + // Without this declared as a `const`, rustc 1.82 creates the Block value on the + // stack first, then `memcpy`s it into `self.blocks`. + const ZERO_BLOCK: Block = Block { + rank: 0, + sub_blocks: [0; SUB_BLOCKS_PER_BLOCK], + bits: [0; SUB_BLOCKS_PER_BLOCK], + }; + self.blocks.push(ZERO_BLOCK); + self.blocks.last_mut().expect("just inserted").rank = curr_rank; } } + self.blocks + .last_mut() + .expect("just ensured there are enough blocks") + .set(position % BITS_PER_BLOCK); } /// Finishes the `BitRank` by writing the last block of data. pub fn finish(mut self) -> BitRank { - if let Some(last_block) = self.curr_block.take() { - self.push_block(last_block); - } + self.finish_last_block(); BitRank { blocks: self.blocks, } @@ -256,8 +225,8 @@ impl BitRank { /// /// # Panics /// This may panic if the values produced by `iter` are not strictly increasing. 
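The builder path added in this patch can be sketched as follows (internal API; positions are illustrative and must be strictly increasing):

```rust
let mut builder = BitRankBuilder::with_capacity(1_000_000);
for pos in [10usize, 500_000, 999_999] {
    builder.push(pos);
}
let br = builder.finish();
assert_eq!(br.rank(500_001), 2); // {10, 500_000} lie below 500_001
assert_eq!(br.max_rank(), 3);
```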
- #[allow(clippy::should_implement_trait)] #[allow(dead_code)] + #[allow(clippy::should_implement_trait)] pub fn from_iter>(iter: I) -> BitRank { let mut builder = BitRankBuilder::new(); for position in iter { @@ -457,7 +426,7 @@ mod tests { let mut rank = 0; let mut select = None; for i in 0..random_bits.capacity() { - if i % BITS_PER_CHUNK == 0 { + if i % BITS_PER_SUB_BLOCK == 0 { select = None; } assert_eq!(br.rank_select(i), (rank, select)); @@ -501,4 +470,30 @@ mod tests { } } } + + #[test] + fn test_large_gap() { + let br = BitRank::from_iter((3..4).chain(BITS_PER_BLOCK * 15..BITS_PER_BLOCK * 15 + 17)); + for i in 1..15 { + assert_eq!(br.rank(BITS_PER_BLOCK * i), 1); + } + for i in 0..18 { + assert_eq!(br.rank(BITS_PER_BLOCK * 15 + i), 1 + i); + } + } + + #[test] + fn test_with_capacity() { + let mut b = BitRankBuilder::with_capacity(BITS_PER_BLOCK * 3 - 1); + let initial_capacity = b.blocks.capacity(); + assert!(initial_capacity >= 3); + b.push(BITS_PER_BLOCK * 3 - 2); // should not have to grow + assert_eq!(b.blocks.capacity(), initial_capacity); + + let mut b = BitRankBuilder::with_capacity(BITS_PER_BLOCK * 3 + 1); + let initial_capacity = b.blocks.capacity(); + assert!(initial_capacity >= 4); + b.push(BITS_PER_BLOCK * 3); // should not have to grow + assert_eq!(b.blocks.capacity(), initial_capacity); + } } diff --git a/crates/string-offsets/src/lib.rs b/crates/string-offsets/src/lib.rs index 70611fb..9535dc0 100644 --- a/crates/string-offsets/src/lib.rs +++ b/crates/string-offsets/src/lib.rs @@ -1,4 +1,6 @@ -//! Position calculator to convert between byte, char, and line positions. +//! Offset calculator to convert between byte, char, and line offsets in a string. +//! +//! See [`StringOffsets`] for details. #![deny(missing_docs)] use std::ops::Range; @@ -7,11 +9,17 @@ mod bitrank; use bitrank::{BitRank, BitRankBuilder}; -/// Position calculator to convert between byte, char, and line positions. +/// Offset calculator to convert between byte, char, and line offsets in a string. /// -/// Rust strings are UTF-8, but JavaScript has UTF-16 strings, while in Python, strings are -/// sequences of Unicode code points. It's therefore necessary to adjust string positions when -/// communicating across programming language boundaries. [`StringOffsets`] does these adjustments. +/// Rust strings are UTF-8, but JavaScript has UTF-16 strings, and in Python, strings are sequences +/// of Unicode code points. It's therefore necessary to adjust string offsets when communicating +/// across programming language boundaries. [`StringOffsets`] does these adjustments. +/// +/// Each `StringOffsets` value contains offset information for a single string. [Building the +/// data structure](StringOffsets::new) takes O(n) time and memory, but then each conversion is fast. +/// +/// ["UTF-8 Conversions with BitRank"](https://adaptivepatchwork.com/2023/07/10/utf-conversion/) +/// is a blog post explaining the implementation. /// /// ## Converting offsets /// @@ -25,16 +33,16 @@ use bitrank::{BitRank, BitRankBuilder}; /// - `char_pos` - Zero-based line number and `char` offset within the line. /// /// For example, [`StringOffsets::utf8_to_utf16`] converts a Rust byte offset to a number that will -/// index to the same position in a JavaScript string. Offsets are expressed as `u32` or [`Pos`] +/// index to the same position in a JavaScript string. Offsets are expressed as `usize` or [`Pos`] /// values. 
/// -/// All methods accept arguments that are off the end of the string (interpreting them as the end -/// of the string). +/// All methods accept arguments that are past the end of the string, interpreting them as pointing +/// to the end of the string. /// /// ## Converting ranges /// -/// Some methods translate position *ranges*. These are expressed as `Range` except for -/// `line`, which is a `u32`: +/// Some methods translate position *ranges*. These are expressed as `Range` except for +/// `line`, which is a `usize`: /// /// - `line` - Zero-based line numbers. The range a `line` refers to is the whole line, including /// the trailing newline character if any. @@ -50,26 +58,26 @@ use bitrank::{BitRank, BitRankBuilder}; /// /// ## Complexity /// -/// Most operations run in O(1) time, some require O(log n) time. The memory consumed by this data -/// structure is typically less than the memory occupied by the actual content. In the best case, -/// it requires ~25% of the content space. +/// Most operations run in O(1) time. A few require O(log n) time. The memory consumed by this +/// data structure is typically less than the memory occupied by the actual content. In the best +/// case, it requires ~45% of the content space. pub struct StringOffsets { - // Vector storing for every line the byte position at which the line starts. + /// Vector storing, for every line, the byte position at which the line starts. line_begins: Vec, - // Encoded bitrank where the rank of a byte position corresponds to the line number to which - // the byte belongs. + /// Encoded bitrank where the rank of a byte position corresponds to the line number to which + /// the byte belongs. utf8_to_line: BitRank, - // Encoded bitrank where the rank of a byte position corresponds to the char position to which - // the byte belongs. + /// Encoded bitrank where the rank of a byte position corresponds to the char position to which + /// the byte belongs. utf8_to_char: BitRank, - // Encoded bitrank where the rank of a byte position corresponds to the UTF-16 encoded word - // position to which the byte belongs. + /// Encoded bitrank where the rank of a byte position corresponds to the UTF-16 encoded word + /// position to which the byte belongs. utf8_to_utf16: BitRank, - // Marks for every line whether it only consists of whitespace characters. + /// Marks, for every line, whether it consists only of whitespace characters. whitespace_only: Vec, } @@ -77,10 +85,10 @@ pub struct StringOffsets { #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub struct Pos { /// Zero-indexed line number. - pub line: u32, + pub line: usize, /// Zero-indexed column number. The units of this field depend on the method that produces the /// value. See [`StringOffsets::utf8_to_char_pos`], [`StringOffsets::utf8_to_utf16_pos`]. - pub col: u32, + pub col: usize, } // The actual conversion implementation between utf8, utf16, chars, and line numbers. @@ -95,114 +103,113 @@ pub struct Pos { // of the line, but `range.end` "rounds up"; and because there are many corner cases. // // E.g.: The empty character range at the end of one line cannot be distinguished from the empty -// character range at the end of the subsequent line! This ambiguity is resolved by returning the +// character range at the start of the subsequent line! This ambiguity is resolved by returning the // line which starts with the empty character range. 
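 // For instance, in "a\nb" the empty range `2..2` sits both at the end of line 0 (after its
 // newline) and at the start of line 1; `utf8s_to_lines(2..2)` therefore reports `1..2`, while
 // `utf8s_to_lines(0..2)` reports `0..1` because the newline still belongs to line 0.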
 //
 // Question: Consider whether we should return an empty line range in this case which would
 // probably be consistent from a mathematical point of view. But then we should also return empty
 // line ranges for empty character ranges in the middle of a line...
 impl StringOffsets {
-    /// Collects position information for the given string.
+    /// Create a new converter to work with offsets into the given string.
     pub fn new(content: &str) -> Self {
         new_converter(content.as_bytes())
     }

-    /// Collects position information for a byte-string.
+    /// Create a new converter to work with offsets into the given byte-string.
     ///
     /// If `content` is UTF-8, this is just like [`StringOffsets::new`]. Otherwise, the
-    /// conversion methods involving characters will produce unspecified (but memory-safe) results.
+    /// conversion methods will produce unspecified (but memory-safe) results.
     pub fn from_bytes(content: &[u8]) -> Self {
         new_converter(content)
     }

     /// Returns the number of Unicode characters on the specified line.
-    pub fn line_chars(&self, line_number: u32) -> u32 {
+    pub fn line_chars(&self, line_number: usize) -> usize {
         let r = self.utf8s_to_chars(self.line_to_utf8s(line_number));
         r.end - r.start
     }

     /// Returns the number of lines in the string.
-    pub fn lines(&self) -> u32 {
-        self.line_begins.len() as u32 - 1
+    pub fn lines(&self) -> usize {
+        self.line_begins.len() - 1
     }

     /// Returns true if the specified line is empty except for whitespace.
-    pub fn only_whitespaces(&self, line_number: u32) -> bool {
+    pub fn only_whitespaces(&self, line_number: usize) -> bool {
         self.whitespace_only
-            .get(line_number as usize)
+            .get(line_number)
             .copied()
             .unwrap_or(true)
     }

     /// Return the byte offset of the first character on the specified (zero-based) line.
     ///
-    /// If `line_number` is greater than the number of lines in the text, this returns the length
-    /// of the string.
-    pub fn line_to_utf8_begin(&self, line_number: u32) -> u32 {
-        self.line_begins[line_number.min(self.lines()) as usize]
+    /// If `line_number` is greater than or equal to the number of lines in the text, this returns
+    /// the length of the string.
+    pub fn line_to_utf8_begin(&self, line_number: usize) -> usize {
+        self.line_begins[line_number.min(self.lines())] as usize
     }

-    /// Python-style offset of the first character of a line.
-    pub fn line_to_char_begin(&self, line_number: u32) -> u32 {
-        self.utf8_to_char(self.line_to_utf8_begin(line_number))
+    /// UTF-16 offset of the first character of a line.
+    ///
+    /// That is, return the offset that would point to the start of that line in a UTF-16
+    /// representation of the source string.
+    pub fn line_to_utf16_begin(&self, line_number: usize) -> usize {
+        self.utf8_to_utf16(self.line_to_utf8_begin(line_number))
     }

-    /// JS-style offset of the first character of a line.
-    pub fn line_to_utf16_begin(&self, line_number: u32) -> u32 {
-        self.utf8_to_utf16(self.line_to_utf8_begin(line_number))
+    /// UTF-32 offset of the first character of a line.
+    ///
+    /// That is, return the offset that would point to the start of that line in a UTF-32
+    /// representation of the source string.
+    pub fn line_to_char_begin(&self, line_number: usize) -> usize {
+        self.utf8_to_char(self.line_to_utf8_begin(line_number))
     }

-    /// Rust-style offset of the first character of a line.
-    pub fn line_to_utf8_end(&self, line_number: u32) -> u32 {
+    /// UTF-8 offset one past the end of a line (the offset of the start of the next line).
+    pub fn line_to_utf8_end(&self, line_number: usize) -> usize {
         self.line_to_utf8_begin(line_number + 1)
     }

-    /// Python-style offset one past the end of a line (the offset of the start of the next line).
-    pub fn line_to_char_end(&self, line_number: u32) -> u32 {
-        self.utf8_to_char(self.line_to_utf8_end(line_number))
+    /// UTF-16 offset one past the end of a line (the offset of the start of the next line).
+    pub fn line_to_utf16_end(&self, line_number: usize) -> usize {
+        self.utf8_to_utf16(self.line_to_utf8_end(line_number))
     }

-    /// JS-style offset one past the end of a line (the offset of the start of the next line).
-    pub fn line_to_utf16_end(&self, line_number: u32) -> u32 {
-        self.utf8_to_utf16(self.line_to_utf8_end(line_number))
+    /// UTF-32 offset one past the end of a line (the offset of the start of the next line).
+    pub fn line_to_char_end(&self, line_number: usize) -> usize {
+        self.utf8_to_char(self.line_to_utf8_end(line_number))
     }

-    /// Rust-style offset one past the end of a line (the offset of the start of the next line).
-    pub fn line_to_utf8s(&self, line_number: u32) -> Range<u32> {
+    /// UTF-8 offsets for the beginning and end of a line, including the newline if any.
+    pub fn line_to_utf8s(&self, line_number: usize) -> Range<usize> {
         self.line_to_utf8_begin(line_number)..self.line_to_utf8_end(line_number)
     }

-    /// Python-style offsets for the beginning and end of a line, including the newline if any.
-    pub fn line_to_chars(&self, line_number: u32) -> Range<u32> {
+    /// UTF-32 offsets for the beginning and end of a line, including the newline if any.
+    pub fn line_to_chars(&self, line_number: usize) -> Range<usize> {
         self.utf8s_to_chars(self.line_to_utf8s(line_number))
     }

-    /// Rust-style offsets for the beginning and end of a line, including the newline if any.
-    pub fn lines_to_utf8s(&self, line_numbers: Range<u32>) -> Range<u32> {
+    /// UTF-8 offsets for the beginning and end of a range of lines, including the newline if any.
+    pub fn lines_to_utf8s(&self, line_numbers: Range<usize>) -> Range<usize> {
         self.line_to_utf8_begin(line_numbers.start)..self.line_to_utf8_begin(line_numbers.end)
     }

-    /// Python-style offsets for the beginning and end of a range of lines, including the newline
-    /// of the last line, if any.
-    pub fn lines_to_chars(&self, line_numbers: Range<u32>) -> Range<u32> {
+    /// UTF-32 offsets for the beginning and end of a range of lines, including the newline if any.
+    pub fn lines_to_chars(&self, line_numbers: Range<usize>) -> Range<usize> {
         self.utf8s_to_chars(self.lines_to_utf8s(line_numbers))
     }

-    /// Return the range of line numbers containing the substring specified by the Python-style
-    /// range `chars`. Newline characters count as part of the preceding line.
-    pub fn chars_to_lines(&self, chars: Range<u32>) -> Range<u32> {
-        self.utf8s_to_lines(self.chars_to_utf8s(chars))
-    }
-
-    /// Return the zero-based line number of the line containing the specified Rust-style offset.
+    /// Return the zero-based line number of the line containing the specified UTF-8 offset.
     /// Newline characters count as part of the preceding line.
-    pub fn utf8_to_line(&self, byte_number: u32) -> u32 {
-        self.utf8_to_line.rank(byte_number as usize) as u32
+    pub fn utf8_to_line(&self, byte_number: usize) -> usize {
+        self.utf8_to_line.rank(byte_number)
     }

-    /// Converts a Rust-style offset to a zero-based line number and Python-style offset within the
+    /// Converts a UTF-8 offset to a zero-based line number and UTF-32 offset within the
     /// line.
- pub fn utf8_to_char_pos(&self, byte_number: u32) -> Pos { + pub fn utf8_to_char_pos(&self, byte_number: usize) -> Pos { let line = self.utf8_to_line(byte_number); let line_start_char_number = self.line_to_char_begin(line); let char_idx = self.utf8_to_char(byte_number); @@ -212,9 +219,9 @@ impl StringOffsets { } } - /// Converts a Rust-style offset to a zero-based line number and JS-style offset within the + /// Converts a UTF-8 offset to a zero-based line number and UTF-16 offset within the /// line. - pub fn utf8_to_utf16_pos(&self, byte_number: u32) -> Pos { + pub fn utf8_to_utf16_pos(&self, byte_number: usize) -> Pos { let line = self.utf8_to_line(byte_number); let line_start_char_number = self.line_to_utf16_begin(line); let char_idx = self.utf8_to_utf16(byte_number); @@ -230,7 +237,7 @@ impl StringOffsets { /// If `bytes` is an empty range at a position within or at the beginning of a line, this /// returns a nonempty range containing the line number of that one line. An empty range at or /// beyond the end of the string translates to an empty range of line numbers. - pub fn utf8s_to_lines(&self, bytes: Range) -> Range { + pub fn utf8s_to_lines(&self, bytes: Range) -> Range { // The fiddly parts of this formula are necessary because `bytes.start` rounds down to the // beginning of the line, but `bytes.end` "rounds up" to the end of the line. the final // `+1` is to produce a half-open range. @@ -240,18 +247,24 @@ impl StringOffsets { .min(self.utf8_to_line(bytes.end.saturating_sub(1).max(bytes.start)) + 1) } - /// Converts a Rust-style offset to Python style. - pub fn utf8_to_char(&self, byte_number: u32) -> u32 { - self.utf8_to_char.rank(byte_number as usize) as u32 + /// Returns the range of line numbers containing the substring specified by the UTF-32 + /// range `chars`. Newline characters count as part of the preceding line. + pub fn chars_to_lines(&self, chars: Range) -> Range { + self.utf8s_to_lines(self.chars_to_utf8s(chars)) + } + + /// Converts a UTF-8 offset to a UTF-32 offset. + pub fn utf8_to_char(&self, byte_number: usize) -> usize { + self.utf8_to_char.rank(byte_number) } - /// Converts a Rust-style offset to JS style. - pub fn utf8_to_utf16(&self, byte_number: u32) -> u32 { - self.utf8_to_utf16.rank(byte_number as usize) as u32 + /// Converts a UTF-8 offset to a UTF-16 offset. + pub fn utf8_to_utf16(&self, byte_number: usize) -> usize { + self.utf8_to_utf16.rank(byte_number) } - /// Converts a Python-style offset to Rust style. - pub fn char_to_utf8(&self, char_number: u32) -> u32 { + /// Converts a UTF-32 offset to a UTF-8 offset. + pub fn char_to_utf8(&self, char_number: usize) -> usize { let mut byte_number = char_number; for _ in 0..128 { let char_number2 = self.utf8_to_char(byte_number); @@ -263,14 +276,15 @@ impl StringOffsets { // If we couldn't find the char within 128 steps, then the char_number might be invalid! // This does not usually happen. For consistency with the rest of the code, we simply return // the max utf8 position in this case. 
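         // (In the common case the loop above terminates quickly: every code point occupies at
         // least one UTF-8 byte, so while `byte_number` is still short of the target,
         // `utf8_to_char(byte_number)` stays below `char_number`; each round therefore advances
         // `byte_number` by at least one byte and never moves past the target position.)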
- if char_number > self.utf8_to_char.max_rank() as u32 { + if char_number > self.utf8_to_char.max_rank() { return self .line_begins .last() .copied() - .expect("last entry represents the length of the file!"); + .expect("last entry represents the length of the file!") + as usize; } - let limit = *self.line_begins.last().expect("no line begins"); + let limit = *self.line_begins.last().expect("no line begins") as usize; // Otherwise, we keep searching, but are a bit more careful and add a check that we don't run into an infinite loop. loop { let char_number2 = self.utf8_to_char(byte_number); @@ -282,21 +296,22 @@ impl StringOffsets { } } - /// Converts a Rust-style offset range to Python style. - pub fn utf8s_to_chars(&self, bytes: Range) -> Range { + /// Converts a UTF-8 offset range to a UTF-32 offset range. + pub fn utf8s_to_chars(&self, bytes: Range) -> Range { self.utf8_to_char(bytes.start)..self.utf8_to_char(bytes.end) } - /// Converts a Python-style offset range to Rust style. - pub fn chars_to_utf8s(&self, chars: Range) -> Range { + /// Converts a UTF-32 offset range to a UTF-8 offset range. + pub fn chars_to_utf8s(&self, chars: Range) -> Range { self.char_to_utf8(chars.start)..self.char_to_utf8(chars.end) } } fn new_converter(content: &[u8]) -> StringOffsets { - let mut utf8_builder = BitRankBuilder::new(); - let mut utf16_builder = BitRankBuilder::new(); - let mut line_builder = BitRankBuilder::new(); + let n = content.len(); + let mut utf8_builder = BitRankBuilder::with_capacity(n); + let mut utf16_builder = BitRankBuilder::with_capacity(n); + let mut line_builder = BitRankBuilder::with_capacity(n); let mut line_begins = vec![0]; let mut i = 0; let mut whitespace_only = vec![]; @@ -343,22 +358,8 @@ fn new_converter(content: &[u8]) -> StringOffsets { } } -/// Returns true if, in a UTF-8 string, `b` always indicates the first byte of a character. -/// -/// (This is true for bytes `0..=127` and `192..=255`.) -pub fn is_char_boundary(b: u8) -> bool { - // Single byte encodings satisfy the bit pattern 0xxxxxxx, i.e. b < 128 - // Continuation bytes satisfy the bit pattern 10xxxxxx, i.e. b < 192 - // The rest are bytes belonging to the first byte of multi byte encodings (11xxxxxx): b >= 192 - // - // When interpreting the byte representation as signed integers, then numbers in the range - // 128..192 correspond to the smallest representable numbers. I.e. the two ranges [0, 128) and - // [192, 256) can be tested with a single signed comparison. - b as i8 >= -0x40 // NB: b < 128 || b >= 192 -} - -/// Returns the number of bytes this utf8 char occupies given the first byte of the utf8 encoding. -/// Returns 0 if the byte is not a valid first byte of a utf8 char. +/// Returns the number of bytes a UTF-8 char occupies, given the first byte of the UTF-8 encoding. +/// Returns 0 if the byte is not a valid first byte of a UTF-8 char. fn utf8_width(c: u8) -> usize { // Every nibble represents the utf8 length given the first 4 bits of a utf8 encoded byte. const UTF8_WIDTH: usize = 0x4322_0000_1111_1111; @@ -376,9 +377,13 @@ fn utf8_to_utf16_width(content: &[u8]) -> usize { } #[cfg(test)] -mod test { - use super::is_char_boundary; - use crate::{utf8_to_utf16_width, utf8_width, Pos, StringOffsets}; +mod tests { + use super::*; + + /// Returns true if, in a UTF-8 string, `b` indicates the first byte of a character. 
+ fn is_char_boundary(b: u8) -> bool { + b as i8 >= -0x40 // NB: b < 128 || b >= 192 + } #[test] fn test_utf8_char_width() { @@ -455,7 +460,7 @@ no terminating newline!"#; assert_eq!(lines.utf8s_to_lines(63..63), 3..3); } - fn pos(line: u32, col: u32) -> Pos { + fn pos(line: usize, col: usize) -> Pos { Pos { line, col } } @@ -511,13 +516,13 @@ line1 let mut utf16_index = 0; let mut char_index = 0; for (byte_index, char) in content.char_indices() { - assert_eq!(lines.utf8_to_char(byte_index as u32), char_index); - assert_eq!(lines.utf8_to_utf16(byte_index as u32), utf16_index); + assert_eq!(lines.utf8_to_char(byte_index), char_index); + assert_eq!(lines.utf8_to_utf16(byte_index), utf16_index); char_index += 1; - utf16_index += char.len_utf16() as u32; + utf16_index += char.len_utf16(); } - assert_eq!(lines.utf8_to_char(content.len() as u32), char_index); - assert_eq!(lines.utf8_to_utf16(content.len() as u32), utf16_index); + assert_eq!(lines.utf8_to_char(content.len()), char_index); + assert_eq!(lines.utf8_to_utf16(content.len()), utf16_index); } #[test] From db9b2cb11b34ec3af3bc84daf8fed341a3d1a0a5 Mon Sep 17 00:00:00 2001 From: Jason Orendorff Date: Fri, 8 Nov 2024 17:03:22 -0600 Subject: [PATCH 05/11] renaming intensifies --- crates/string-offsets/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/string-offsets/README.md b/crates/string-offsets/README.md index 668d35d..e7eb5d5 100644 --- a/crates/string-offsets/README.md +++ b/crates/string-offsets/README.md @@ -1,4 +1,4 @@ -# UTF-8 Converter +# string-offsets This crate converts string positions between Rust style (UTF-8 byte offsets) and styles used by other programming languages, as well as line numbers. @@ -8,5 +8,5 @@ Add this to your `Cargo.toml`: ```toml [dependencies] -utf8-converter = "0.1" +string-offsets = "0.1" ``` From fd056fbac6d422e0ef7fca58fed03c9b9bc5079a Mon Sep 17 00:00:00 2001 From: Jason Orendorff Date: Tue, 12 Nov 2024 09:49:24 -0600 Subject: [PATCH 06/11] update readme after renaming --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 8d42c06..a97abe3 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ A collection of useful algorithms written in Rust. Currently contains: - [`geo_filters`](crates/geo_filters): probabilistic data structures that solve the [Distinct Count Problem](https://en.wikipedia.org/wiki/Count-distinct_problem) using geometric filters. - [`bpe`](crates/bpe): fast, correct, and novel algorithms for the [Byte Pair Encoding Algorithm](https://en.wikipedia.org/wiki/Large_language_model#BPE) which are particularly useful for chunking of documents. -- [`utf8-converter`](crates/utf8-converter): converts string positions between bytes, chars, UTF-16 code units, and line numbers. Useful when sending string indices across language boundaries. +- [`string-offsets`](crates/string-offsets): converts string positions between bytes, chars, UTF-16 code units, and line numbers. Useful when sending string indices across language boundaries. 
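To make that last bullet concrete, here is a minimal sketch (the string and offsets are illustrative, and the calls assume the `StringOffsets` API introduced by the patches in this series): a byte offset computed on the Rust side becomes the line and UTF-16 column that a JavaScript or LSP consumer expects.

```rust
use string_offsets::StringOffsets;

fn main() {
    // U+00E9 ("é") is two UTF-8 bytes but one UTF-16 code unit, so byte and
    // UTF-16 offsets drift apart after it.
    let src = "caf\u{e9}\nbar";
    let offsets = StringOffsets::new(src);

    // The byte offset of 'b', the start of the second line, on the Rust side...
    let byte = src.find('b').unwrap();
    assert_eq!(byte, 6);

    // ...corresponds to line 1, UTF-16 column 0 for a JavaScript/LSP consumer,
    // and to flat UTF-16 offset 5 ("café" is four code units, plus the newline).
    let pos = offsets.utf8_to_utf16_pos(byte);
    assert_eq!((pos.line, pos.col), (1, 0));
    assert_eq!(offsets.utf8_to_utf16(byte), 5);
}
```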
## Background From ef3257565c9d099eddd9f2d379566563654eae34 Mon Sep 17 00:00:00 2001 From: Jason Orendorff Date: Tue, 12 Nov 2024 11:20:24 -0600 Subject: [PATCH 07/11] Flesh out string-offsets README --- crates/string-offsets/README.md | 35 +++++++++++++++++++++++++++++++- crates/string-offsets/src/lib.rs | 22 ++++++++++++++++++++ 2 files changed, 56 insertions(+), 1 deletion(-) diff --git a/crates/string-offsets/README.md b/crates/string-offsets/README.md index e7eb5d5..fd04fc6 100644 --- a/crates/string-offsets/README.md +++ b/crates/string-offsets/README.md @@ -1,6 +1,16 @@ # string-offsets -This crate converts string positions between Rust style (UTF-8 byte offsets) and styles used by other programming languages, as well as line numbers. +Offset calculator to convert between byte, char, and line offsets in a string. + +Rust strings are UTF-8, but JavaScript has UTF-16 strings, and in Python, strings are sequences of +Unicode code points. It's therefore necessary to adjust string offsets when communicating across +programming language boundaries. [`StringOffsets`] does these adjustments. + +Each `StringOffsets` value contains offset information for a single string. [Building the data +structure](StringOffsets::new) takes O(n) time and memory, but then each conversion is fast. + +["UTF-8 Conversions with BitRank"](https://adaptivepatchwork.com/2023/07/10/utf-conversion/) is a +blog post explaining the implementation. ## Usage @@ -10,3 +20,26 @@ Add this to your `Cargo.toml`: [dependencies] string-offsets = "0.1" ``` + +Then: + +```rust +use string_offsets::StringOffsets; + +let s = "☀️hello\n🗺️world\n"; +let offsets = StringOffsets::new(s); + +// Find offsets where lines begin and end. +assert_eq!(offsets.line_to_utf8s(0), 0..12); // note: 0-based line numbers + +// Translate string offsets between UTF-8 and other encodings. +// This map emoji is 7 UTF-8 bytes... +assert_eq!(&s[12..19], "🗺️"); +// ...but only 3 UTF-16 code units... +assert_eq!(offsets.utf8_to_utf16(12), 8); +assert_eq!(offsets.utf8_to_utf16(19), 11); +// ...and only 2 Unicode characters. +assert_eq!(offsets.utf8s_to_chars(12..19), 8..10); +``` + +See [the documentation](https://docs.rs/string-offsets/latest/string_offsets/struct.StringOffsets.html) for more. diff --git a/crates/string-offsets/src/lib.rs b/crates/string-offsets/src/lib.rs index 9535dc0..02ea569 100644 --- a/crates/string-offsets/src/lib.rs +++ b/crates/string-offsets/src/lib.rs @@ -1,5 +1,27 @@ //! Offset calculator to convert between byte, char, and line offsets in a string. //! +//! +//! # Example +//! +//! ``` +//! use string_offsets::StringOffsets; +//! +//! let s = "☀️hello\n🗺️world\n"; +//! let offsets = StringOffsets::new(s); +//! +//! // Find offsets where lines begin and end. +//! assert_eq!(offsets.line_to_utf8s(0), 0..12); // note: 0-based line numbers +//! +//! // Translate string offsets between UTF-8 and other encodings. +//! // This map emoji is 7 UTF-8 bytes... +//! assert_eq!(&s[12..19], "🗺️"); +//! // ...but only 3 UTF-16 code units... +//! assert_eq!(offsets.utf8_to_utf16(12), 8); +//! assert_eq!(offsets.utf8_to_utf16(19), 11); +//! // ...and only 2 Unicode characters. +//! assert_eq!(offsets.utf8s_to_chars(12..19), 8..10); +//! ``` +//! //! See [`StringOffsets`] for details. 
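The doctest above focuses on flat offset conversions; the line-oriented methods from the earlier patches follow the rule that a newline belongs to the line it terminates. A minimal sketch, assuming the `utf8s_to_lines` behavior documented in this series:

```rust
use string_offsets::StringOffsets;

fn main() {
    let s = "fn main() {\n    println!(\"hi\");\n}\n";
    let offsets = StringOffsets::new(s);

    // A byte range touching the second and third lines maps to the half-open
    // line range 1..3.
    let body = s.find("println").unwrap()..s.len();
    assert_eq!(offsets.utf8s_to_lines(body), 1..3);

    // An empty byte range at the start of a line still reports that one line
    // (byte 12 is where the second line begins).
    assert_eq!(offsets.utf8s_to_lines(12..12), 1..2);
}
```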
#![deny(missing_docs)] From 2fcc9034bbeb030224ea261f656dcad5f239e7b6 Mon Sep 17 00:00:00 2001 From: Jason Orendorff Date: Tue, 12 Nov 2024 13:09:42 -0600 Subject: [PATCH 08/11] last-minute polishing --- crates/string-offsets/Cargo.toml | 9 +++++++-- crates/string-offsets/src/lib.rs | 1 - 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/crates/string-offsets/Cargo.toml b/crates/string-offsets/Cargo.toml index 842b7f7..dabf4d3 100644 --- a/crates/string-offsets/Cargo.toml +++ b/crates/string-offsets/Cargo.toml @@ -1,8 +1,13 @@ [package] -authors = ["The blackbird team "] -edition = "2021" name = "string-offsets" +authors = ["The blackbird team "] version = "0.1.0" +edition = "2021" +description = "Offset calculator to convert between byte, char, and line offsets in a string." +repository = "https://github.com/github/rust-gems" +license = "MIT" +keywords = ["unicode", "string", "offsets", "positions", "interoperability"] +categories = ["algorithms", "data-structures", "text-processing", "development-tools::ffi"] [dev-dependencies] itertools = "0.13" diff --git a/crates/string-offsets/src/lib.rs b/crates/string-offsets/src/lib.rs index 02ea569..a24d45c 100644 --- a/crates/string-offsets/src/lib.rs +++ b/crates/string-offsets/src/lib.rs @@ -1,6 +1,5 @@ //! Offset calculator to convert between byte, char, and line offsets in a string. //! -//! //! # Example //! //! ``` From bfaa2dee4e8bed3fdfeb1b99b04efe01685ac567 Mon Sep 17 00:00:00 2001 From: Jason Orendorff Date: Tue, 12 Nov 2024 13:18:48 -0600 Subject: [PATCH 09/11] remove dead code --- crates/string-offsets/src/bitrank.rs | 181 ++++----------------------- 1 file changed, 26 insertions(+), 155 deletions(-) diff --git a/crates/string-offsets/src/bitrank.rs b/crates/string-offsets/src/bitrank.rs index 88c77cb..6524769 100644 --- a/crates/string-offsets/src/bitrank.rs +++ b/crates/string-offsets/src/bitrank.rs @@ -1,9 +1,6 @@ //! A bit-vector data structure, optimized for //! [rank](http://bitmagic.io/rank-select.html) operations. //! -//! There is also an opportunistic `select` operation, but the general case has not been -//! implemented. -//! //! See also: ["Succinct data structure"](https://en.wikipedia.org/wiki/Succinct_data_structure). type SubblockBits = u128; @@ -55,15 +52,6 @@ impl Block { self.bits[chunk_idx] ^= mask; } - /// Tests whether the bit at the given index is set. - fn get(&self, index: usize) -> bool { - assert!(index < BITS_PER_BLOCK); - let chunk_idx = index / BITS_PER_SUB_BLOCK; - let bit_idx = index % BITS_PER_SUB_BLOCK; - let mask = 1 << ((BITS_PER_SUB_BLOCK - 1) - bit_idx); - self.bits[chunk_idx] & mask != 0 - } - /// The **total rank** of the block relative local index, and the index of the one /// bit that establishes that rank (aka "select") **if** it occurs within that same /// chunk, otherwise ['None']. 
The assumption is that if you would have to look back @@ -99,40 +87,6 @@ impl Block { .map(|c| c.count_ones() as usize) .sum::() } - - fn predecessor(&self, idx: usize) -> Option { - let sub_block = idx / BITS_PER_SUB_BLOCK; - let masked = self.bits[sub_block] >> (BITS_PER_SUB_BLOCK - 1 - idx % BITS_PER_SUB_BLOCK); - if masked > 0 { - Some(idx - masked.trailing_zeros() as usize) - } else { - for i in (0..sub_block).rev() { - let masked = self.bits[i]; - if masked > 0 { - return Some( - (i + 1) * BITS_PER_SUB_BLOCK - masked.trailing_zeros() as usize - 1, - ); - } - } - None - } - } - - fn successor(&self, idx: usize) -> Option { - let sub_block = idx / BITS_PER_SUB_BLOCK; - let masked = self.bits[sub_block] << (idx % BITS_PER_SUB_BLOCK); - if masked > 0 { - Some(idx + masked.leading_zeros() as usize) - } else { - for i in (sub_block + 1)..SUB_BLOCKS_PER_BLOCK { - let masked = self.bits[i]; - if masked > 0 { - return Some(i * BITS_PER_SUB_BLOCK + masked.leading_zeros() as usize); - } - } - None - } - } } /// Builder for creating a [`BitRank`]. @@ -154,6 +108,7 @@ pub struct BitRankBuilder { impl BitRankBuilder { /// Returns a new builder. + #[cfg(test)] pub fn new() -> Self { Self::default() } @@ -221,20 +176,6 @@ pub struct BitRank { } impl BitRank { - /// Creates a `BitRank` containing the integers in `iter`. - /// - /// # Panics - /// This may panic if the values produced by `iter` are not strictly increasing. - #[allow(dead_code)] - #[allow(clippy::should_implement_trait)] - pub fn from_iter>(iter: I) -> BitRank { - let mut builder = BitRankBuilder::new(); - for position in iter { - builder.push(position); - } - builder.finish() - } - /// The rank at the specified index (exclusive). /// /// The (one) rank is defined as: `rank(i) = sum(b[j] for j in 0..i)` @@ -243,51 +184,6 @@ impl BitRank { self.rank_select(idx).0 } - /// Tests whether the bit at the given index is set. - #[allow(dead_code)] - pub fn get(&self, idx: usize) -> bool { - let block_num = idx / BITS_PER_BLOCK; - // assert!(block_num < self.blocks.len(), "index out of bounds"); - if block_num >= self.blocks.len() { - false - } else { - self.blocks[block_num].get(idx % BITS_PER_BLOCK) - } - } - - /// Returns the 1 bit at or before the specified index. - #[allow(dead_code)] - pub fn predecessor(&self, idx: usize) -> usize { - let block_num = idx / BITS_PER_BLOCK; - if block_num < self.blocks.len() { - if let Some(p) = self.blocks[block_num].predecessor(idx % BITS_PER_BLOCK) { - return block_num * BITS_PER_BLOCK + p; - } - } - for block_num in (0..self.blocks.len().min(block_num)).rev() { - if let Some(p) = self.blocks[block_num].predecessor(BITS_PER_BLOCK - 1) { - return block_num * BITS_PER_BLOCK + p; - } - } - panic!("no predecessor found!"); - } - - /// Returns the next 1 bit at or after the specified index. - #[allow(dead_code)] - pub fn successor(&self, idx: usize) -> usize { - let block_num = idx / BITS_PER_BLOCK; - if let Some(s) = self.blocks[block_num].successor(idx % BITS_PER_BLOCK) { - s + block_num * BITS_PER_BLOCK - } else { - for block_num in block_num + 1..self.blocks.len() { - if let Some(p) = self.blocks[block_num].successor(0) { - return block_num * BITS_PER_BLOCK + p; - } - } - panic!("no successor found!"); - } - } - /// Returns the number of elements in the set. pub fn max_rank(&self) -> usize { self.blocks @@ -314,58 +210,55 @@ impl BitRank { (rank, b_idx.map(|i| (block_num * BITS_PER_BLOCK) + i)) } } - - /// The total size of the bit vec that was allocated. 
- /// **Note:** This is more like capacity than normal `len` in that it does not - /// consider how much of the bit vec is actually used. - #[allow(dead_code)] - pub fn capacity(&self) -> usize { - self.blocks.len() * BITS_PER_BLOCK - } } #[cfg(test)] mod tests { - use itertools::Itertools; use rand::distributions::Uniform; use rand::prelude::*; use rand_chacha::ChaCha8Rng; use super::*; - fn write(positions: &[usize]) -> BitRank { - BitRank::from_iter(positions.iter().copied()) + /// Creates a `BitRank` containing the integers in `iter` (which should be strictly + /// increasing). + pub fn bitrank>(iter: I) -> BitRank { + let mut builder = BitRankBuilder::new(); + for position in iter { + builder.push(position); + } + builder.finish() } #[test] fn test_rank_zero() { - let br = BitRank::from_iter([0]); + let br = bitrank([0]); assert_eq!(br.rank(0), 0); assert_eq!(br.rank(1), 1); } #[test] fn test_empty() { - let br = BitRank::from_iter([]); + let br = bitrank([]); assert!(br.blocks.is_empty()); } #[test] fn test_index_out_of_bounds() { - let br = BitRank::from_iter([BITS_PER_BLOCK - 1]); + let br = bitrank([BITS_PER_BLOCK - 1]); assert_eq!(br.rank(BITS_PER_BLOCK), 1); } #[test] #[should_panic] fn test_duplicate_position() { - write(&[64, 66, 68, 68, 90]); + bitrank([64, 66, 68, 68, 90]); } #[test] fn test_rank_exclusive() { - let br = BitRank::from_iter(0..132); - assert_eq!(br.capacity(), BITS_PER_BLOCK); + let br = bitrank(0..132); + assert_eq!(br.blocks.len(), 1); assert_eq!(br.rank(64), 64); assert_eq!(br.rank(132), 132); } @@ -374,15 +267,13 @@ mod tests { fn test_rank() { let mut positions: Vec = (0..132).collect(); positions.append(&mut vec![138usize, 140, 146]); - let br = write(&positions); + let br = bitrank(positions); assert_eq!(br.rank(135), 132); - let bits2: Vec = (0..BITS_PER_BLOCK - 5).collect(); - let br2 = write(&bits2); + let br2 = bitrank(0..BITS_PER_BLOCK - 5); assert_eq!(br2.rank(169), 169); - let bits3: Vec = (0..BITS_PER_BLOCK + 5).collect(); - let br3 = write(&bits3); + let br3 = bitrank(0..BITS_PER_BLOCK + 5); assert_eq!(br3.rank(BITS_PER_BLOCK), BITS_PER_BLOCK); } @@ -390,23 +281,23 @@ mod tests { fn test_rank_idx() { let mut positions: Vec = (0..132).collect(); positions.append(&mut vec![138usize, 140, 146]); - let br = write(&positions); + let br = bitrank(positions); assert_eq!(br.rank_select(135), (132, Some(131))); let bits2: Vec = (0..BITS_PER_BLOCK - 5).collect(); - let br2 = write(&bits2); + let br2 = bitrank(bits2); assert_eq!(br2.rank_select(169), (169, Some(168))); let bits3: Vec = (0..BITS_PER_BLOCK + 5).collect(); - let br3 = write(&bits3); + let br3 = bitrank(bits3); assert_eq!(br3.rank_select(BITS_PER_BLOCK), (BITS_PER_BLOCK, None)); let bits4: Vec = vec![1, 1000, 9999, BITS_PER_BLOCK + 1]; - let br4 = write(&bits4); + let br4 = bitrank(bits4); assert_eq!(br4.rank_select(10000), (3, Some(9999))); let bits5: Vec = vec![1, 1000, 9999, BITS_PER_BLOCK + 1]; - let br5 = write(&bits5); + let br5 = bitrank(bits5); assert_eq!(br5.rank_select(BITS_PER_BLOCK), (3, None)); } @@ -422,7 +313,7 @@ mod tests { // This isn't strictly necessary, given that the bit would just be toggled again, but it // ensures that we are meeting the contract. 
random_bits.dedup(); - let br = write(&random_bits); + let br = bitrank(random_bits.iter().copied()); let mut rank = 0; let mut select = None; for i in 0..random_bits.capacity() { @@ -442,7 +333,7 @@ mod tests { #[test] fn test_rank_out_of_bounds() { for i in 1..30 { - let br = write(&[BITS_PER_BLOCK * i - 1]); + let br = bitrank([BITS_PER_BLOCK * i - 1]); assert_eq!(br.max_rank(), 1); assert_eq!(br.rank(BITS_PER_BLOCK * i - 1), 0); for j in 0..10 { @@ -451,29 +342,9 @@ mod tests { } } - #[test] - fn test_predecessor_and_successor() { - let mut rng = ChaCha8Rng::seed_from_u64(2); - let uniform = Uniform::::from(0..1_000_000); - let mut random_bits = Vec::with_capacity(100_000); - for _ in 0..100_000 { - random_bits.push(uniform.sample(&mut rng)); - } - random_bits.sort_unstable(); - random_bits.dedup(); - let br = write(&random_bits); - - for (i, j) in random_bits.iter().copied().tuple_windows() { - for k in i..j { - assert_eq!(br.successor(k + 1), j, "{i} {k} {j}"); - assert_eq!(br.predecessor(k), i, "{i} {k} {j}"); - } - } - } - #[test] fn test_large_gap() { - let br = BitRank::from_iter((3..4).chain(BITS_PER_BLOCK * 15..BITS_PER_BLOCK * 15 + 17)); + let br = bitrank((3..4).chain(BITS_PER_BLOCK * 15..BITS_PER_BLOCK * 15 + 17)); for i in 1..15 { assert_eq!(br.rank(BITS_PER_BLOCK * i), 1); } From 3cecb0b23a8408ccc8300337d0539ba8070c8fd7 Mon Sep 17 00:00:00 2001 From: Jason Orendorff Date: Wed, 13 Nov 2024 11:32:41 -0600 Subject: [PATCH 10/11] address review comments --- crates/string-offsets/Cargo.toml | 4 ++-- crates/string-offsets/README.md | 6 +++--- crates/string-offsets/src/lib.rs | 12 +++++++----- 3 files changed, 12 insertions(+), 10 deletions(-) diff --git a/crates/string-offsets/Cargo.toml b/crates/string-offsets/Cargo.toml index dabf4d3..7d60dd2 100644 --- a/crates/string-offsets/Cargo.toml +++ b/crates/string-offsets/Cargo.toml @@ -3,10 +3,10 @@ name = "string-offsets" authors = ["The blackbird team "] version = "0.1.0" edition = "2021" -description = "Offset calculator to convert between byte, char, and line offsets in a string." +description = "Converts string offsets between UTF-8 bytes, UTF-16 code units, Unicode code points, and lines." repository = "https://github.com/github/rust-gems" license = "MIT" -keywords = ["unicode", "string", "offsets", "positions", "interoperability"] +keywords = ["unicode", "positions", "utf16", "characters", "lines"] categories = ["algorithms", "data-structures", "text-processing", "development-tools::ffi"] [dev-dependencies] diff --git a/crates/string-offsets/README.md b/crates/string-offsets/README.md index fd04fc6..7ad8c23 100644 --- a/crates/string-offsets/README.md +++ b/crates/string-offsets/README.md @@ -1,13 +1,13 @@ # string-offsets -Offset calculator to convert between byte, char, and line offsets in a string. +Converts string offsets between UTF-8 bytes, UTF-16 code units, Unicode code points, and lines. Rust strings are UTF-8, but JavaScript has UTF-16 strings, and in Python, strings are sequences of Unicode code points. It's therefore necessary to adjust string offsets when communicating across programming language boundaries. [`StringOffsets`] does these adjustments. -Each `StringOffsets` value contains offset information for a single string. [Building the data -structure](StringOffsets::new) takes O(n) time and memory, but then each conversion is fast. +Each `StringOffsets` instance contains offset information for a single string. 
[Building the data +structure](StringOffsets::new) takes O(n) time and memory, but then most conversions are O(1). ["UTF-8 Conversions with BitRank"](https://adaptivepatchwork.com/2023/07/10/utf-conversion/) is a blog post explaining the implementation. diff --git a/crates/string-offsets/src/lib.rs b/crates/string-offsets/src/lib.rs index a24d45c..ee05e54 100644 --- a/crates/string-offsets/src/lib.rs +++ b/crates/string-offsets/src/lib.rs @@ -1,4 +1,4 @@ -//! Offset calculator to convert between byte, char, and line offsets in a string. +//! Converts string offsets between UTF-8 bytes, UTF-16 code units, Unicode code points, and lines. //! //! # Example //! @@ -17,7 +17,7 @@ //! // ...but only 3 UTF-16 code units... //! assert_eq!(offsets.utf8_to_utf16(12), 8); //! assert_eq!(offsets.utf8_to_utf16(19), 11); -//! // ...and only 2 Unicode characters. +//! // ...and only 2 Unicode code points. //! assert_eq!(offsets.utf8s_to_chars(12..19), 8..10); //! ``` //! @@ -30,14 +30,16 @@ mod bitrank; use bitrank::{BitRank, BitRankBuilder}; -/// Offset calculator to convert between byte, char, and line offsets in a string. +/// Converts positions within a given string between UTF-8 byte offsets (the usual in Rust), UTF-16 +/// code units, Unicode code points, and line numbers. /// /// Rust strings are UTF-8, but JavaScript has UTF-16 strings, and in Python, strings are sequences /// of Unicode code points. It's therefore necessary to adjust string offsets when communicating /// across programming language boundaries. [`StringOffsets`] does these adjustments. /// -/// Each `StringOffsets` value contains offset information for a single string. [Building the -/// data structure](StringOffsets::new) takes O(n) time and memory, but then each conversion is fast. +/// Each `StringOffsets` instance contains offset information for a single string. [Building the +/// data structure](StringOffsets::new) takes O(n) time and memory, but then most conversions are +/// O(1). /// /// ["UTF-8 Conversions with BitRank"](https://adaptivepatchwork.com/2023/07/10/utf-conversion/) /// is a blog post explaining the implementation. From a2735a452e74dc3c0a7b4e3debaeebcbfa7962d1 Mon Sep 17 00:00:00 2001 From: Jason Orendorff Date: Wed, 13 Nov 2024 11:46:43 -0600 Subject: [PATCH 11/11] Remove obsolete dev-dependency on itertools --- crates/string-offsets/Cargo.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/crates/string-offsets/Cargo.toml b/crates/string-offsets/Cargo.toml index 7d60dd2..fd9b838 100644 --- a/crates/string-offsets/Cargo.toml +++ b/crates/string-offsets/Cargo.toml @@ -10,6 +10,5 @@ keywords = ["unicode", "positions", "utf16", "characters", "lines"] categories = ["algorithms", "data-structures", "text-processing", "development-tools::ffi"] [dev-dependencies] -itertools = "0.13" rand = "0.8" rand_chacha = "0.3"
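As a closing illustration of the cross-language use case the new crate description emphasizes, here is a minimal sketch, assuming the `chars_to_utf8s` and `utf8s_to_chars` methods introduced earlier in the series: a Python-style code-point range is mapped to a byte range for slicing on the Rust side, and back again.

```rust
use string_offsets::StringOffsets;

fn main() {
    // The leading map emoji is four UTF-8 bytes but a single code point, so
    // code-point offsets and byte offsets disagree for the rest of the line.
    let s = "\u{1f5fa} voyage: 42 km\n";
    let offsets = StringOffsets::new(s);

    // A Python-side tool reports the distance as the code-point range 10..12 ("42").
    let chars = 10..12;

    // Convert to UTF-8 byte offsets before slicing the Rust string...
    let bytes = offsets.chars_to_utf8s(chars.clone());
    assert_eq!(bytes, 13..15);
    assert_eq!(&s[bytes.clone()], "42");

    // ...and the round trip recovers the original code-point range.
    assert_eq!(offsets.utf8s_to_chars(bytes), chars);
}
```

Because the underlying tables are built once per string, repeating such lookups against the same `StringOffsets` value stays cheap, in line with the O(n) build / fast-conversion costs stated above.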