From b5a2f5cf9a627c9e7977a7ceccec40516387c77f Mon Sep 17 00:00:00 2001 From: Jason Orendorff Date: Fri, 1 Nov 2024 16:15:41 -0500 Subject: [PATCH 01/11] Add `utf8-converter`. --- README.md | 1 + crates/utf8-converter/Cargo.toml | 10 + crates/utf8-converter/README.md | 12 + crates/utf8-converter/src/bitrank.rs | 504 +++++++++++++++++++++++ crates/utf8-converter/src/lib.rs | 577 +++++++++++++++++++++++++++ 5 files changed, 1104 insertions(+) create mode 100644 crates/utf8-converter/Cargo.toml create mode 100644 crates/utf8-converter/README.md create mode 100644 crates/utf8-converter/src/bitrank.rs create mode 100644 crates/utf8-converter/src/lib.rs diff --git a/README.md b/README.md index 6232e9d..8d42c06 100644 --- a/README.md +++ b/README.md @@ -4,6 +4,7 @@ A collection of useful algorithms written in Rust. Currently contains: - [`geo_filters`](crates/geo_filters): probabilistic data structures that solve the [Distinct Count Problem](https://en.wikipedia.org/wiki/Count-distinct_problem) using geometric filters. - [`bpe`](crates/bpe): fast, correct, and novel algorithms for the [Byte Pair Encoding Algorithm](https://en.wikipedia.org/wiki/Large_language_model#BPE) which are particularly useful for chunking of documents. +- [`utf8-converter`](crates/utf8-converter): converts string positions between bytes, chars, UTF-16 code units, and line numbers. Useful when sending string indices across language boundaries. ## Background diff --git a/crates/utf8-converter/Cargo.toml b/crates/utf8-converter/Cargo.toml new file mode 100644 index 0000000..49da760 --- /dev/null +++ b/crates/utf8-converter/Cargo.toml @@ -0,0 +1,10 @@ +[package] +authors = ["The blackbird team "] +edition = "2021" +name = "utf8-converter" +version = "0.1.0" + +[dependencies] +itertools = "0.13" +rand = "0.8" +rand_chacha = "0.3" diff --git a/crates/utf8-converter/README.md b/crates/utf8-converter/README.md new file mode 100644 index 0000000..668d35d --- /dev/null +++ b/crates/utf8-converter/README.md @@ -0,0 +1,12 @@ +# UTF-8 Converter + +This crate converts string positions between Rust style (UTF-8 byte offsets) and styles used by other programming languages, as well as line numbers. + +## Usage + +Add this to your `Cargo.toml`: + +```toml +[dependencies] +utf8-converter = "0.1" +``` diff --git a/crates/utf8-converter/src/bitrank.rs b/crates/utf8-converter/src/bitrank.rs new file mode 100644 index 0000000..fc2fa87 --- /dev/null +++ b/crates/utf8-converter/src/bitrank.rs @@ -0,0 +1,504 @@ +//! A bit-vector data structure, optimized for +//! [rank](http://bitmagic.io/rank-select.html) operations. +//! +//! There is also an opportunistic `select` operation, but the general case has not been +//! implemented. + +type Chunk = u128; + +// Static sizing of the various components of the data structure. +const BITS_PER_BLOCK: usize = 16384; +const BITS_PER_SUB_BLOCK: usize = 128; +const SUB_BLOCKS_PER_BLOCK: usize = BITS_PER_BLOCK / BITS_PER_SUB_BLOCK; +const BITS_PER_CHUNK: usize = 128; +const CHUNKS_PER_SUB_BLOCK: usize = BITS_PER_SUB_BLOCK / BITS_PER_CHUNK; + +/// A container for a portion of the total bit vector and the associated indices. +/// The bits within each chunk are stored from most significant bit (msb) to least significant bit (lsb). +/// i.e. index 0 of a Chunk is at the start of visual binary representation or a value of +/// 1u128 << 127. 
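For illustration, here is a minimal standalone sketch (not part of this diff; values are ours) of the msb-first convention just described: chunk-local index `i` selects the mask `1u128 << (127 - i)`.

```rust
fn main() {
    // msb-first: chunk-local index `i` maps to the bit `1 << (127 - i)`.
    let mask = |i: u32| -> u128 { 1u128 << (127 - i) };
    assert_eq!(mask(0), 1u128 << 127); // index 0 is the most significant bit
    assert_eq!(mask(127), 1);          // index 127 is the least significant bit

    // Setting indices 0 and 2 puts both bits at the top end of the chunk.
    let chunk: u128 = mask(0) | mask(2);
    assert_eq!(chunk.leading_zeros(), 0);
    assert_eq!(chunk.count_ones(), 2);
}
```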
+/// +/// The actual bits are stored alongside the indices because the common case will be reading this +/// information from disk (rather than random access memory), so it is beneficial to have all of +/// the data that we need in the same page. +/// +/// ```text +/// index: [ 0, 1, 2, 3, 4, 5, 6, 7 ] +/// bits: [ 0, 1, 0, 1, 1, 0, 1, 0 ] +/// rank(exclusive): [ 0, 0, 1, 1, 2, 3, 3, 4 ] +/// block rank: [ 0 ] +/// sub-block rank: [ 0 ][ 2 ] +/// ``` +#[derive(Clone, Debug)] +#[repr(C)] +struct Block { + /// Rank of the first bit in this block (that is, the number of bits set in previous blocks). + rank: u64, + /// Rank of the first bit (bit 0) of each subblock, relative to the start of the block. + /// That is, `sub_blocks[i]` is the number of bits set in the `bits` representing + /// sub-blocks `0..i`. `sub_blocks[0]` is always zero. + sub_blocks: [u16; SUB_BLOCKS_PER_BLOCK], + /// The bit-vector. + bits: [Chunk; BITS_PER_BLOCK / BITS_PER_CHUNK], +} + +impl Block { + fn new(rank: u64) -> Self { + Self { + rank, + sub_blocks: [0; SUB_BLOCKS_PER_BLOCK], + bits: [0; BITS_PER_BLOCK / BITS_PER_CHUNK], + } + } + + /// Set a bit without updating `self.sub_blocks`. + /// + /// This panics if the bit was already set, because that indicates that the original positions + /// list is invalid/had duplicates. + fn set(&mut self, index: usize) { + assert!(index < BITS_PER_BLOCK); + let chunk_idx = index / BITS_PER_CHUNK; + let bit_idx = index % BITS_PER_CHUNK; + let mask = 1 << ((BITS_PER_CHUNK - 1) - bit_idx); + assert_eq!(self.bits[chunk_idx] & mask, 0, "toggling bits off indicates that the original data was incorrect, most likely containing duplicate values."); + self.bits[chunk_idx] ^= mask; + } + + /// Tests whether the bit at the given index is set. + #[allow(dead_code)] + fn get(&self, index: usize) -> bool { + assert!(index < BITS_PER_BLOCK); + let chunk_idx = index / BITS_PER_CHUNK; + let bit_idx = index % BITS_PER_CHUNK; + let mask = 1 << ((BITS_PER_CHUNK - 1) - bit_idx); + self.bits[chunk_idx] & mask != 0 + } + + /// The **total rank** of the block relative local index, and the index of the one + /// bit that establishes that rank (aka "select") **if** it occurs within that same + /// chunk, otherwise ['None']. The assumption is that if you would have to look back + /// through previous chunks it would actually be cheaper to do a lookup in the original + /// data structure that the bit vector was created from. + fn rank_select(&self, local_idx: usize) -> (usize, Option) { + let mut rank = self.rank as usize; + let sub_block = local_idx / BITS_PER_SUB_BLOCK; + rank += self.sub_blocks[sub_block] as usize; + + if BITS_PER_CHUNK != BITS_PER_SUB_BLOCK { + for i in sub_block * CHUNKS_PER_SUB_BLOCK..local_idx / BITS_PER_CHUNK { + rank += self.bits[i].count_ones() as usize; + } + } + + let remainder = local_idx % BITS_PER_CHUNK; + + let last_chunk = local_idx / BITS_PER_CHUNK; + let masked = if remainder == 0 { + 0 + } else { + self.bits[last_chunk] >> (BITS_PER_CHUNK - remainder) + }; + rank += masked.count_ones() as usize; + let select = if masked == 0 { + None + } else { + Some(local_idx - masked.trailing_zeros() as usize - 1) + }; + (rank, select) + } + + fn total_rank(&self) -> usize { + self.sub_blocks[SUB_BLOCKS_PER_BLOCK - 1] as usize + + self.rank as usize + + self.bits[(SUB_BLOCKS_PER_BLOCK - 1) * CHUNKS_PER_SUB_BLOCK..] 
+ .iter() + .map(|c| c.count_ones() as usize) + .sum::() + } + + fn predecessor(&self, idx: usize) -> Option { + let sub_block = idx / BITS_PER_SUB_BLOCK; + let masked = self.bits[sub_block] >> (BITS_PER_SUB_BLOCK - 1 - idx % BITS_PER_SUB_BLOCK); + if masked > 0 { + Some(idx - masked.trailing_zeros() as usize) + } else { + for i in (0..sub_block).rev() { + let masked = self.bits[i]; + if masked > 0 { + return Some( + (i + 1) * BITS_PER_SUB_BLOCK - masked.trailing_zeros() as usize - 1, + ); + } + } + None + } + } + + fn successor(&self, idx: usize) -> Option { + let sub_block = idx / BITS_PER_SUB_BLOCK; + let masked = self.bits[sub_block] << (idx % BITS_PER_SUB_BLOCK); + if masked > 0 { + Some(idx + masked.leading_zeros() as usize) + } else { + for i in (sub_block + 1)..SUB_BLOCKS_PER_BLOCK { + let masked = self.bits[i]; + if masked > 0 { + return Some(i * BITS_PER_SUB_BLOCK + masked.leading_zeros() as usize); + } + } + None + } + } +} + +impl Default for Block { + fn default() -> Self { + Block { + rank: 0, + sub_blocks: [0u16; SUB_BLOCKS_PER_BLOCK], + bits: [0; BITS_PER_BLOCK / BITS_PER_CHUNK], + } + } +} + +/// Builder for creating a [`BitRank`]. +/// +/// # Examples +/// +/// ```text +/// // Note: This should work as a doctest, except this module is not public. +/// let mut bytes = Vec::::new(); +/// +/// let mut builder = BitRankBuilder::new(); +/// builder.push(17); +/// builder.push(23); +/// builder.push(102); +/// let set = builder.finish(); +/// assert_eq!(set.rank(100), 2); +/// ``` +#[derive(Default)] +pub struct BitRankBuilder { + blocks: Vec, + curr_rank: u64, + curr_block_id: usize, + curr_block: Option, +} + +impl BitRankBuilder { + /// Returns a new builder. + pub fn new() -> Self { + Self::default() + } + + fn push_block(&mut self, mut block: Block) -> u64 { + let mut local_rank = 0; + for (i, chunk) in block.bits.iter().enumerate() { + // If the settings are ever changed, CHUNKS_PER_SUB_BLOCK will likely no longer be 1, so + // you will need this modulo. + #[expect(clippy::modulo_one)] + if i % CHUNKS_PER_SUB_BLOCK == 0 { + block.sub_blocks[i / CHUNKS_PER_SUB_BLOCK] = local_rank; + } + local_rank += chunk.count_ones() as u16; + } + let end_rank = block.rank + local_rank as u64; + self.blocks.push(block); + end_rank + } + + /// Adds a bit. Bits must be added in order of increasing `position`. + pub fn push(&mut self, position: usize) { + let block_id = position / BITS_PER_BLOCK; + assert!( + self.curr_block_id <= block_id, + "positions must be increasing!" + ); + while block_id > self.curr_block_id { + let curr_block = self + .curr_block + .take() + .unwrap_or_else(|| Block::new(self.curr_rank)); + let end_rank = self.push_block(curr_block); + self.curr_rank = end_rank; + self.curr_block_id += 1; + } + match &mut self.curr_block { + None => { + let mut block = Block::new(self.curr_rank); + block.set(position % BITS_PER_BLOCK); + self.curr_block = Some(block); + } + Some(block) => { + block.set(position % BITS_PER_BLOCK); + } + } + } + + /// Finishes the `BitRank` by writing the last block of data. + pub fn finish(mut self) -> BitRank { + if let Some(last_block) = self.curr_block.take() { + self.push_block(last_block); + } + BitRank { + blocks: self.blocks, + } + } +} + +/// An immutable set of unsigned integers with an efficient `rank` method. +#[derive(Clone)] +pub struct BitRank { + blocks: Vec, +} + +impl BitRank { + /// Creates a `BitRank` containing the integers in `iter`. 
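Because `bitrank` is a private module, a usage example has to live in the crate's own tests; assuming the API shown in this diff, it would look roughly like the following sketch (positions are illustrative only).

```rust
// Hypothetical placement: inside the tests of crates/utf8-converter/src/bitrank.rs.
let br = BitRank::from_iter([2usize, 3, 5, 7, 11]);
assert_eq!(br.rank(0), 0);         // no set positions below 0
assert_eq!(br.rank(4), 2);         // {2, 3} lie below 4
assert_eq!(br.rank(1_000_000), 5); // out-of-range indices saturate to the total count
assert_eq!(br.max_rank(), 5);
assert_eq!(br.predecessor(6), 5);  // nearest set position at or before 6
assert_eq!(br.successor(6), 7);    // nearest set position at or after 6
```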
+ /// + /// # Panics + /// This may panic if the values produced by `iter` are not strictly increasing. + #[allow(clippy::should_implement_trait)] + #[allow(dead_code)] + pub fn from_iter>(iter: I) -> BitRank { + let mut builder = BitRankBuilder::new(); + for position in iter { + builder.push(position); + } + builder.finish() + } + + /// The rank at the specified index (exclusive). + /// + /// The (one) rank is defined as: `rank(i) = sum(b[j] for j in 0..i)` + /// i.e. the number of elements less than `i`. + pub fn rank(&self, idx: usize) -> usize { + self.rank_select(idx).0 + } + + /// Tests whether the bit at the given index is set. + #[allow(dead_code)] + pub fn get(&self, idx: usize) -> bool { + let block_num = idx / BITS_PER_BLOCK; + // assert!(block_num < self.blocks.len(), "index out of bounds"); + if block_num >= self.blocks.len() { + false + } else { + self.blocks[block_num].get(idx % BITS_PER_BLOCK) + } + } + + /// Returns the 1 bit at or before the specified index. + #[allow(dead_code)] + pub fn predecessor(&self, idx: usize) -> usize { + let block_num = idx / BITS_PER_BLOCK; + if block_num < self.blocks.len() { + if let Some(p) = self.blocks[block_num].predecessor(idx % BITS_PER_BLOCK) { + return block_num * BITS_PER_BLOCK + p; + } + } + for block_num in (0..self.blocks.len().min(block_num)).rev() { + if let Some(p) = self.blocks[block_num].predecessor(BITS_PER_BLOCK - 1) { + return block_num * BITS_PER_BLOCK + p; + } + } + panic!("no predecessor found!"); + } + + /// Returns the next 1 bit at or after the specified index. + #[allow(dead_code)] + pub fn successor(&self, idx: usize) -> usize { + let block_num = idx / BITS_PER_BLOCK; + if let Some(s) = self.blocks[block_num].successor(idx % BITS_PER_BLOCK) { + s + block_num * BITS_PER_BLOCK + } else { + for block_num in block_num + 1..self.blocks.len() { + if let Some(p) = self.blocks[block_num].successor(0) { + return block_num * BITS_PER_BLOCK + p; + } + } + panic!("no successor found!"); + } + } + + /// Returns the number of elements in the set. + pub fn max_rank(&self) -> usize { + self.blocks + .last() + .map(|b| b.total_rank()) + .unwrap_or_default() // fall back to 0 when the bitrank data structure is empty. + } + + /// The rank at the specified index(exclusive) and the index of the one bit that + /// establishes that rank (aka "select") **if** it occurs within that same chunk, + /// otherwise ['None']. The assumption is that if you would have to look back + /// through previous chunks it would actually be cheaper to do a lookup in the original + /// data structure that the bit vector was created from. + pub fn rank_select(&self, idx: usize) -> (usize, Option) { + let block_num = idx / BITS_PER_BLOCK; + // assert!(block_num < self.blocks.len(), "index out of bounds"); + if block_num >= self.blocks.len() { + ( + self.max_rank(), // fall back to 0 when the bitrank data structure is empty. + None, + ) + } else { + let (rank, b_idx) = self.blocks[block_num].rank_select(idx % BITS_PER_BLOCK); + (rank, b_idx.map(|i| (block_num * BITS_PER_BLOCK) + i)) + } + } + + /// The total size of the bit vec that was allocated. + /// **Note:** This is more like capacity than normal `len` in that it does not + /// consider how much of the bit vec is actually used. 
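A quick sketch of the distinction drawn above, using the internal API from this diff (`16384` is `BITS_PER_BLOCK` in this patch; values are illustrative):

```rust
// One position in block 0 and one in block 1, so two blocks get allocated.
let br = BitRank::from_iter([5usize, 20_000]);
assert_eq!(br.capacity(), 2 * 16384); // allocated bits across both blocks
assert_eq!(br.max_rank(), 2);         // number of positions actually set
```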
+ #[allow(dead_code)] + pub fn capacity(&self) -> usize { + self.blocks.len() * BITS_PER_BLOCK + } +} + +#[cfg(test)] +mod tests { + use itertools::Itertools; + use rand::distributions::Uniform; + use rand::prelude::*; + use rand_chacha::ChaCha8Rng; + + use super::*; + + fn write(positions: &[usize]) -> BitRank { + BitRank::from_iter(positions.iter().copied()) + } + + #[test] + fn test_rank_zero() { + let br = BitRank::from_iter([0]); + assert_eq!(br.rank(0), 0); + assert_eq!(br.rank(1), 1); + } + + #[test] + fn test_empty() { + let br = BitRank::from_iter([]); + assert!(br.blocks.is_empty()); + } + + #[test] + fn test_index_out_of_bounds() { + let br = BitRank::from_iter([BITS_PER_BLOCK - 1]); + assert_eq!(br.rank(BITS_PER_BLOCK), 1); + } + + #[test] + #[should_panic] + fn test_duplicate_position() { + write(&[64, 66, 68, 68, 90]); + } + + #[test] + fn test_rank_exclusive() { + let br = BitRank::from_iter(0..132); + assert_eq!(br.capacity(), BITS_PER_BLOCK); + assert_eq!(br.rank(64), 64); + assert_eq!(br.rank(132), 132); + } + + #[test] + fn test_rank() { + let mut positions: Vec = (0..132).collect(); + positions.append(&mut vec![138usize, 140, 146]); + let br = write(&positions); + assert_eq!(br.rank(135), 132); + + let bits2: Vec = (0..BITS_PER_BLOCK - 5).collect(); + let br2 = write(&bits2); + assert_eq!(br2.rank(169), 169); + + let bits3: Vec = (0..BITS_PER_BLOCK + 5).collect(); + let br3 = write(&bits3); + assert_eq!(br3.rank(BITS_PER_BLOCK), BITS_PER_BLOCK); + } + + #[test] + fn test_rank_idx() { + let mut positions: Vec = (0..132).collect(); + positions.append(&mut vec![138usize, 140, 146]); + let br = write(&positions); + assert_eq!(br.rank_select(135), (132, Some(131))); + + let bits2: Vec = (0..BITS_PER_BLOCK - 5).collect(); + let br2 = write(&bits2); + assert_eq!(br2.rank_select(169), (169, Some(168))); + + let bits3: Vec = (0..BITS_PER_BLOCK + 5).collect(); + let br3 = write(&bits3); + assert_eq!(br3.rank_select(BITS_PER_BLOCK), (BITS_PER_BLOCK, None)); + + let bits4: Vec = vec![1, 1000, 9999, BITS_PER_BLOCK + 1]; + let br4 = write(&bits4); + assert_eq!(br4.rank_select(10000), (3, Some(9999))); + + let bits5: Vec = vec![1, 1000, 9999, BITS_PER_BLOCK + 1]; + let br5 = write(&bits5); + assert_eq!(br5.rank_select(BITS_PER_BLOCK), (3, None)); + } + + #[test] + fn test_rank_large_random() { + let mut rng = ChaCha8Rng::seed_from_u64(2); + let uniform = Uniform::::from(0..1_000_000); + let mut random_bits = Vec::with_capacity(100_000); + for _ in 0..100_000 { + random_bits.push(uniform.sample(&mut rng)); + } + random_bits.sort_unstable(); + // This isn't strictly necessary, given that the bit would just be toggled again, but it + // ensures that we are meeting the contract. + random_bits.dedup(); + let br = write(&random_bits); + let mut rank = 0; + let mut select = None; + for i in 0..random_bits.capacity() { + if i % BITS_PER_CHUNK == 0 { + select = None; + } + assert_eq!(br.rank_select(i), (rank, select)); + if i == random_bits[rank] { + rank += 1; + select = Some(i); + } + } + } + + /// Test that we properly handle the case where the position is out of bounds for all + /// potentially tricky bit positions. 
+ #[test] + fn test_rank_out_of_bounds() { + for i in 1..30 { + let br = write(&[BITS_PER_BLOCK * i - 1]); + assert_eq!(br.max_rank(), 1); + assert_eq!(br.rank(BITS_PER_BLOCK * i - 1), 0); + for j in 0..10 { + assert_eq!(br.rank(BITS_PER_BLOCK * (i + j)), 1); + } + } + } + + #[test] + fn test_predecessor_and_successor() { + let mut rng = ChaCha8Rng::seed_from_u64(2); + let uniform = Uniform::::from(0..1_000_000); + let mut random_bits = Vec::with_capacity(100_000); + for _ in 0..100_000 { + random_bits.push(uniform.sample(&mut rng)); + } + random_bits.sort_unstable(); + random_bits.dedup(); + let br = write(&random_bits); + + for (i, j) in random_bits.iter().copied().tuple_windows() { + for k in i..j { + assert_eq!(br.successor(k + 1), j, "{i} {k} {j}"); + assert_eq!(br.predecessor(k), i, "{i} {k} {j}"); + } + } + } +} diff --git a/crates/utf8-converter/src/lib.rs b/crates/utf8-converter/src/lib.rs new file mode 100644 index 0000000..182583d --- /dev/null +++ b/crates/utf8-converter/src/lib.rs @@ -0,0 +1,577 @@ +//! Position calculator to convert between byte, char, and line positions. + +use std::ops::Range; + +mod bitrank; + +use bitrank::{BitRank, BitRankBuilder}; + +/// Position calculator to convert between byte, char, and line positions. +/// +/// Rust strings are UTF-8, but JavaScript has UTF-16 strings, while in Python, strings are +/// sequences of Unicode code points. It's therefore necessary to adjust string positions when +/// communicating across programming language boundaries. [`Utf8Converter`] does these adjustments. +/// +/// ## Converting offsets +/// +/// The conversion methods follow a naming scheme that uses these terms for different kinds of +/// offsets: +/// +/// - `utf8` - UTF-8 byte offsets (Rust style). +/// - `utf16` - UTF-16 code unit offsets (JavaScript style). +/// - `char` - Count of Unicode scalar values (Python style). +/// - `utf16_pos` - Zero-based line number and `utf16` offset within the line. +/// - `char_pos` - Zero-based line number and `char` offset within the line. +/// +/// For example, [`Utf8Converter::utf8_to_utf16`] converts a Rust byte offset to a number that will +/// index to the same position in a JavaScript string. Offsets are expressed as `u32` or [`Pos`] +/// values. +/// +/// All methods accept arguments that are off the end of the string (interpreting them as the end +/// of the string). +/// +/// ## Converting ranges +/// +/// Some methods translate position *ranges*. These are expressed as `Range` except for +/// `line`, which is a `u32`: +/// +/// - `line` - Zero-based line numbers. The range a `line` refers to is the whole line, including +/// the trailing newline character if any. +/// - `lines` - A range of line numbers. +/// - `utf8s` - UTF-8 byte ranges. +/// - `utf16s` - UTF-16 code unit ranges. +/// - `chars` - Ranges of Unicode scalar values. +/// +/// When mapping offsets to line ranges, it is important to use a `_to_lines` function in order to +/// end up with the correct line range. We have these methods because if you tried to do it +/// yourself you would screw it up; use them! (And see the source code for +/// [`Utf8Converter::utf8s_to_lines`] if you don't believe us.) +/// +/// ## Complexity +/// +/// Most operations run in O(1) time, some require O(log n) time. The memory consumed by this data +/// structure is typically less than the memory occupied by the actual content. In the best case, +/// it requires ~25% of the content space. 
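As a usage sketch of the conversions described above (the API names are the ones introduced in this patch; the example string and offsets are ours):

```rust
use utf8_converter::Utf8Converter;

fn main() {
    // "é" is 2 UTF-8 bytes but 1 UTF-16 code unit and 1 char.
    let text = "héllo\nwörld";
    let conv = Utf8Converter::new(text);

    let byte = text.find("llo").unwrap() as u32; // Rust-style byte offset 3
    assert_eq!(conv.utf8_to_char(byte), 2);      // Python-style offset
    assert_eq!(conv.utf8_to_utf16(byte), 2);     // JavaScript-style offset

    // Line/column conversion: the byte offset of 'w' is the start of line 1.
    let w = text.find('w').unwrap() as u32;
    let pos = conv.utf8_to_char_pos(w);
    assert_eq!((pos.line, pos.col), (1, 0));
}
```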
+pub struct Utf8Converter { + // Vector storing for every line the byte position at which the line starts. + line_begins: Vec, + + // Encoded bitrank where the rank of a byte position corresponds to the line number to which + // the byte belongs. + utf8_to_line: BitRank, + + // Encoded bitrank where the rank of a byte position corresponds to the char position to which + // the byte belongs. + utf8_to_char: BitRank, + + // Encoded bitrank where the rank of a byte position corresponds to the UTF-16 encoded word + // position to which the byte belongs. + utf8_to_utf16: BitRank, + + // Marks for every line whether it only consists of whitespace characters. + whitespace_only: Vec, +} + +/// A position in a string, specified by line and column number. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct Pos { + /// Zero-indexed line number. + pub line: u32, + /// Zero-indexed column number. The units of this field depend on the method that produces the + /// value. See [`Utf8Converter::utf8_to_char_pos`], [`Utf8Converter::utf8_to_utf16_pos`]. + pub col: u32, +} + +// The actual conversion implementation between utf8, utf16, chars, and line numbers. +// New methods must follow the existing conventions: +// +// - All conversions saturate when the input is out of bounds. +// - Lines INCLUDE the terminating newline. +// - Line numbers and column numbers are 0-based. +// - `.xyz_to_lines(range)` methods behave like `.utf8_to_lines(the corresponding byte range)`. +// +// This last one is tricky, because in these methods, `range.begin` "rounds down" to the beginning +// of the line, but `range.end` "rounds up"; and because there are many corner cases. +// +// E.g.: The empty character range at the end of one line cannot be distinguished from the empty +// character range at the end of the subsequent line! This ambiguity is resolved by returning the +// line which starts with the empty character range. +// +// Question: Consider whether we should return an empty line range in this case which would +// probably be consistent from a mathematical point of view. But then we should also return empty +// line ranges for empty character ranges in the middle of a line... +impl Utf8Converter { + /// Collects position information for the given string. + pub fn new(content: &str) -> Self { + new_converter(content.as_bytes()) + } + + /// Collects position information for a byte-string. + /// + /// If `content` is UTF-8, this is just like [`Utf8Converter::new`]. Otherwise, the + /// conversion methods involving characters will produce unspecified (but memory-safe) results. + pub fn from_bytes(content: &[u8]) -> Self { + new_converter(content) + } + + /// Returns the number of Unicode characters on the specified line. + pub fn line_chars(&self, line_number: u32) -> u32 { + let r = self.utf8s_to_chars(self.line_to_utf8s(line_number)); + r.end - r.start + } + + /// Returns the number of lines in the string. + pub fn lines(&self) -> u32 { + self.line_begins.len() as u32 - 1 + } + + pub fn only_whitespaces(&self, line_number: u32) -> bool { + self.whitespace_only + .get(line_number as usize) + .copied() + .unwrap_or(true) + } + + /// Return the byte offset of the first character on the specified (zero-based) line. + /// + /// If `line_number` is greater than the number of lines in the text, this returns the length + /// of the string. 
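A brief sketch of the clamping behaviour described here (example values are ours):

```rust
use utf8_converter::Utf8Converter;

fn main() {
    let conv = Utf8Converter::new("ab\ncd"); // two lines, 5 bytes
    assert_eq!(conv.lines(), 2);
    assert_eq!(conv.line_to_utf8_begin(0), 0);
    assert_eq!(conv.line_to_utf8_begin(1), 3);
    assert_eq!(conv.line_to_utf8_begin(99), 5); // past the last line: length of the string
}
```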
+ pub fn line_to_utf8_begin(&self, line_number: u32) -> u32 { + self.line_begins[line_number.min(self.lines()) as usize] + } + + /// Python-style offset of the first character of a line. + pub fn line_to_char_begin(&self, line_number: u32) -> u32 { + self.utf8_to_char(self.line_to_utf8_begin(line_number)) + } + + /// JS-style offset of the first character of a line. + pub fn line_to_utf16_begin(&self, line_number: u32) -> u32 { + self.utf8_to_utf16(self.line_to_utf8_begin(line_number)) + } + + /// Rust-style offset of the first character of a line. + pub fn line_to_utf8_end(&self, line_number: u32) -> u32 { + self.line_to_utf8_begin(line_number + 1) + } + + /// Python-style offset one past the end of a line (the offset of the start of the next line). + pub fn line_to_char_end(&self, line_number: u32) -> u32 { + self.utf8_to_char(self.line_to_utf8_end(line_number)) + } + + /// JS-style offset one past the end of a line (the offset of the start of the next line). + pub fn line_to_utf16_end(&self, line_number: u32) -> u32 { + self.utf8_to_utf16(self.line_to_utf8_end(line_number)) + } + + /// Rust-style offset one past the end of a line (the offset of the start of the next line). + pub fn line_to_utf8s(&self, line_number: u32) -> Range { + self.line_to_utf8_begin(line_number)..self.line_to_utf8_end(line_number) + } + + /// Python-style offsets for the beginning and end of a line, including the newline if any. + pub fn line_to_chars(&self, line_number: u32) -> Range { + self.utf8s_to_chars(self.line_to_utf8s(line_number)) + } + + /// Rust-style offsets for the beginning and end of a line, including the newline if any. + pub fn lines_to_utf8s(&self, line_numbers: Range) -> Range { + self.line_to_utf8_begin(line_numbers.start)..self.line_to_utf8_begin(line_numbers.end) + } + + /// Python-style offsets for the beginning and end of a range of lines, including the newline + /// of the last line, if any. + pub fn lines_to_chars(&self, line_numbers: Range) -> Range { + self.utf8s_to_chars(self.lines_to_utf8s(line_numbers)) + } + + /// Return the range of line numbers containing the substring specified by the Python-style + /// range `chars`. Newline characters count as part of the preceding line. + pub fn chars_to_lines(&self, chars: Range) -> Range { + self.utf8s_to_lines(self.chars_to_utf8s(chars)) + } + + /// Return the zero-based line number of the line containing the specified Rust-style offset. + /// Newline characters count as part of the preceding line. + pub fn utf8_to_line(&self, byte_number: u32) -> u32 { + self.utf8_to_line.rank(byte_number as usize) as u32 + } + + /// Converts a Rust-style offset to a zero-based line number and Python-style offset within the + /// line. + pub fn utf8_to_char_pos(&self, byte_number: u32) -> Pos { + let line = self.utf8_to_line(byte_number); + let line_start_char_number = self.line_to_char_begin(line); + let char_idx = self.utf8_to_char(byte_number); + Pos { + line, + col: char_idx - line_start_char_number, + } + } + + /// Converts a Rust-style offset to a zero-based line number and JS-style offset within the + /// line. + pub fn utf8_to_utf16_pos(&self, byte_number: u32) -> Pos { + let line = self.utf8_to_line(byte_number); + let line_start_char_number = self.line_to_utf16_begin(line); + let char_idx = self.utf8_to_utf16(byte_number); + Pos { + line, + col: char_idx - line_start_char_number, + } + } + + /// Returns the range of line numbers containing the substring specified by the Rust-style + /// range `bytes`. 
Newline characters count as part of the preceding line. + /// + /// If `bytes` is an empty range at a position within or at the beginning of a line, this + /// returns a nonempty range containing the line number of that one line. An empty range at or + /// beyond the end of the string translates to an empty range of line numbers. + pub fn utf8s_to_lines(&self, bytes: Range) -> Range { + // The fiddly parts of this formula are necessary because `bytes.start` rounds down to the + // beginning of the line, but `bytes.end` "rounds up" to the end of the line. the final + // `+1` is to produce a half-open range. + self.utf8_to_line(bytes.start) + ..self + .lines() + .min(self.utf8_to_line(bytes.end.saturating_sub(1).max(bytes.start)) + 1) + } + + /// Converts a Rust-style offset to Python style. + pub fn utf8_to_char(&self, byte_number: u32) -> u32 { + self.utf8_to_char.rank(byte_number as usize) as u32 + } + + /// Converts a Rust-style offset to JS style. + pub fn utf8_to_utf16(&self, byte_number: u32) -> u32 { + self.utf8_to_utf16.rank(byte_number as usize) as u32 + } + + /// Converts a Python-style offset to Rust style. + pub fn char_to_utf8(&self, char_number: u32) -> u32 { + let mut byte_number = char_number; + for _ in 0..128 { + let char_number2 = self.utf8_to_char(byte_number); + if char_number2 == char_number { + return byte_number; + } + byte_number += char_number - char_number2; + } + // If we couldn't find the char within 128 steps, then the char_number might be invalid! + // This does not usually happen. For consistency with the rest of the code, we simply return + // the max utf8 position in this case. + if char_number > self.utf8_to_char.max_rank() as u32 { + return self + .line_begins + .last() + .copied() + .expect("last entry represents the length of the file!"); + } + let limit = *self.line_begins.last().expect("no line begins"); + // Otherwise, we keep searching, but are a bit more careful and add a check that we don't run into an infinite loop. + loop { + let char_number2 = self.utf8_to_char(byte_number); + if char_number2 == char_number { + return byte_number; + } + byte_number += char_number - char_number2; + assert!(byte_number < limit); + } + } + + /// Converts a Rust-style offset range to Python style. + pub fn utf8s_to_chars(&self, bytes: Range) -> Range { + self.utf8_to_char(bytes.start)..self.utf8_to_char(bytes.end) + } + + /// Converts a Python-style offset range to Rust style. + pub fn chars_to_utf8s(&self, chars: Range) -> Range { + self.char_to_utf8(chars.start)..self.char_to_utf8(chars.end) + } +} + +fn new_converter(content: &[u8]) -> Utf8Converter { + let mut utf8_builder = BitRankBuilder::new(); + let mut utf16_builder = BitRankBuilder::new(); + let mut line_builder = BitRankBuilder::new(); + let mut line_begins = vec![0]; + let mut i = 0; + let mut whitespace_only = vec![]; + let mut only_whitespaces = true; // true if all characters in the current line are whitespaces. + while i < content.len() { + // In case of invalid utf8, we might get a utf8_len of 0. + // In this case, we just treat the single byte character. + // In principle, a single incorrect byte can break the whole decoding... 
+ let c = content[i]; + let utf8_len = utf8_width(c).max(1); + if i > 0 { + utf8_builder.push(i - 1); + utf16_builder.push(i - 1); + } + if utf8_to_utf16_width(&content[i..]) > 1 { + utf16_builder.push(i); + } + if c == b'\n' { + whitespace_only.push(only_whitespaces); + line_begins.push(i as u32 + 1); + line_builder.push(i); + only_whitespaces = true; // reset for next line. + } else { + only_whitespaces &= matches!(c, b'\t' | b'\r' | b' '); + } + i += utf8_len; + } + if !content.is_empty() { + utf8_builder.push(content.len() - 1); + utf16_builder.push(content.len() - 1); + } + if line_begins.last() != Some(&(content.len() as u32)) { + whitespace_only.push(only_whitespaces); + line_begins.push(content.len() as u32); + line_builder.push(content.len() - 1); + } + + Utf8Converter { + line_begins, + utf8_to_line: line_builder.finish(), + whitespace_only, + utf8_to_char: utf8_builder.finish(), + utf8_to_utf16: utf16_builder.finish(), + } +} + +/// Returns true if, in a UTF-8 string, `b` always indicates the first byte of a character. +/// +/// (This is true for bytes `0..=127` and `192..=255`.) +pub fn is_char_boundary(b: u8) -> bool { + // Single byte encodings satisfy the bit pattern 0xxxxxxx, i.e. b < 128 + // Continuation bytes satisfy the bit pattern 10xxxxxx, i.e. b < 192 + // The rest are bytes belonging to the first byte of multi byte encodings (11xxxxxx): b >= 192 + // + // When interpreting the byte representation as signed integers, then numbers in the range + // 128..192 correspond to the smallest representable numbers. I.e. the two ranges [0, 128) and + // [192, 256) can be tested with a single signed comparison. + b as i8 >= -0x40 // NB: b < 128 || b >= 192 +} + +/// Returns the number of bytes this utf8 char occupies given the first byte of the utf8 encoding. +/// Returns 0 if the byte is not a valid first byte of a utf8 char. +fn utf8_width(c: u8) -> usize { + // Every nibble represents the utf8 length given the first 4 bits of a utf8 encoded byte. + const UTF8_WIDTH: usize = 0x4322_0000_1111_1111; + (UTF8_WIDTH >> ((c >> 4) * 4)) & 0xf +} + +fn utf8_to_utf16_width(content: &[u8]) -> usize { + let len = utf8_width(content[0]); + match len { + 0 => 0, + 1..=3 => 1, + 4 => 2, + _ => panic!("invalid utf8 char width: {}", len), + } +} + +#[cfg(test)] +mod test { + use super::is_char_boundary; + use crate::{utf8_to_utf16_width, utf8_width, Pos, Utf8Converter}; + + #[test] + fn test_utf8_char_width() { + for c in '\0'..=char::MAX { + let mut dst = [0; 4]; + let len = c.encode_utf8(&mut dst).len(); + assert_eq!(len, utf8_width(dst[0]), "char: {:?} {len}", dst[0] >> 4); + } + + for b in 0..=255u8 { + if !is_char_boundary(b) { + assert_eq!(utf8_width(b), 0, "char: {:?}", b >> 4); + } else { + assert!(utf8_width(b) > 0, "char: {:?}", b >> 4); + } + } + } + + #[test] + fn test_utf8_to_utf16_len() { + for c in '\0'..=char::MAX { + let mut dst = [0; 4]; + let _len = c.encode_utf8(&mut dst).len(); + assert_eq!(utf8_to_utf16_width(&dst), c.len_utf16()); + } + + for b in 0..=255u8 { + if !is_char_boundary(b) { + assert_eq!(utf8_to_utf16_width(&[b]), 0); + } + } + } + + #[test] + fn test_line_map() { + let content = r#"a short line. +followed by another one. 
+no terminating newline!"#; + let lines = Utf8Converter::new(content); + assert_eq!(lines.line_to_utf8s(0), 0..14); + assert_eq!(&content[0..14], "a short line.\n"); + assert_eq!(lines.line_to_utf8s(1), 14..39); + assert_eq!(&content[14..39], "followed by another one.\n"); + assert_eq!(lines.line_to_utf8s(2), 39..62); + assert_eq!(&content[39..62], "no terminating newline!"); + assert_eq!(lines.utf8_to_line(0), 0); + assert_eq!(lines.utf8_to_line(13), 0); + assert_eq!(lines.utf8_to_line(14), 1); + assert_eq!(lines.utf8_to_line(38), 1); + assert_eq!(lines.utf8_to_line(39), 2); + assert_eq!(lines.utf8_to_line(61), 2); + assert_eq!(lines.utf8_to_line(62), 3); // <<-- this character is beyond the content. + assert_eq!(lines.utf8_to_line(100), 3); + assert_eq!(lines.utf8s_to_chars(4..10), 4..10); + assert_eq!(lines.chars_to_utf8s(4..10), 4..10); + + assert_eq!(content.len(), 62); + assert_eq!(lines.lines_to_utf8s(2..3), 39..62); + assert_eq!(lines.lines_to_utf8s(2..4), 39..62); + assert_eq!(lines.lines_to_chars(2..4), 39..62); + assert_eq!(lines.utf8s_to_lines(39..62), 2..3); + assert_eq!(lines.utf8s_to_lines(39..63), 2..3); // The "invalid" utf8 position results in a valid line position. + assert_eq!(lines.char_to_utf8(62), 62); + assert_eq!(lines.char_to_utf8(63), 62); // char 63 doesn't exist, so we map to the closest valid utf8 position. + + // Empty ranges + assert_eq!(lines.utf8s_to_lines(0..0), 0..1); + assert_eq!(lines.utf8s_to_lines(13..13), 0..1); + assert_eq!(lines.utf8s_to_lines(14..14), 1..2); + assert_eq!(lines.utf8s_to_lines(38..38), 1..2); + assert_eq!(lines.utf8s_to_lines(39..39), 2..3); + assert_eq!(lines.utf8s_to_lines(61..61), 2..3); + assert_eq!(lines.utf8s_to_lines(62..62), 3..3); + assert_eq!(lines.utf8s_to_lines(63..63), 3..3); + } + + fn pos(line: u32, col: u32) -> Pos { + Pos { line, col } + } + + #[test] + fn test_convert_ascii() { + let content = r#"line0 +line1"#; + let lines = Utf8Converter::new(content); + assert_eq!(lines.utf8_to_char_pos(0), pos(0, 0)); + assert_eq!(lines.utf8_to_char_pos(1), pos(0, 1)); + assert_eq!(lines.utf8_to_char_pos(6), pos(1, 0)); + assert_eq!(lines.utf8_to_char_pos(7), pos(1, 1)); + } + + #[test] + fn test_convert_unicode() { + // Á - 2 bytes utf8 + let content = r#"❤️ line0 +line1 +✅ line2"#; + let lines = Utf8Converter::new(content); + assert_eq!(lines.utf8_to_char_pos(0), pos(0, 0)); // ❤️ takes 6 bytes to represent in utf8 (2 code points) + assert_eq!(lines.utf8_to_char_pos(1), pos(0, 0)); + assert_eq!(lines.utf8_to_char_pos(2), pos(0, 0)); + assert_eq!(lines.utf8_to_char_pos(3), pos(0, 1)); + assert_eq!(lines.utf8_to_char_pos(4), pos(0, 1)); + assert_eq!(lines.utf8_to_char_pos(5), pos(0, 1)); + + assert_eq!(lines.utf8_to_char_pos(6), pos(0, 2)); // + assert_eq!(lines.utf8_to_char_pos(7), pos(0, 3)); // line + // ^ + + assert_eq!(lines.utf8_to_char_pos(13), pos(1, 0)); // line + // ^ + + assert_eq!(lines.utf8_to_char_pos(19), pos(2, 0)); // ✅ takes 3 bytes to represent in utf8 (1 code point) + assert_eq!(lines.utf8_to_char_pos(20), pos(2, 0)); + assert_eq!(lines.utf8_to_char_pos(21), pos(2, 0)); + + assert_eq!(lines.utf8_to_char_pos(22), pos(2, 1)); // + + assert_eq!(lines.utf8_to_utf16_pos(0), pos(0, 0)); // ❤️ takes 4 bytes to represent in utf16 (2 code points) + assert_eq!(lines.utf8_to_utf16_pos(1), pos(0, 0)); + assert_eq!(lines.utf8_to_utf16_pos(2), pos(0, 0)); + assert_eq!(lines.utf8_to_utf16_pos(3), pos(0, 1)); + } + + #[test] + fn test_small() { + // Á - 2 bytes utf8 + let content = r#"❤️ line0 ❤️Á 👋"#; + let lines = 
Utf8Converter::new(content); + let mut utf16_index = 0; + let mut char_index = 0; + for (byte_index, char) in content.char_indices() { + assert_eq!(lines.utf8_to_char(byte_index as u32), char_index); + assert_eq!(lines.utf8_to_utf16(byte_index as u32), utf16_index); + char_index += 1; + utf16_index += char.len_utf16() as u32; + } + assert_eq!(lines.utf8_to_char(content.len() as u32), char_index); + assert_eq!(lines.utf8_to_utf16(content.len() as u32), utf16_index); + } + + #[test] + fn test_variable_lengths() { + let content = r#"❤️Á 👋"#; + // ^~ utf8: 1 char, 4 bytes, utf16: 2 code units + // ^~~~ utf8: 1 char, 1 byte, utf16: 1 code unit + // ^~~~~ utf8: 1 char, 2 bytes, utf16: 1 code unit + // ^~~~~~ utf8: 2 chars, 3 byte ea., utf16: 2 code units + let lines = Utf8Converter::new(content); + + // UTF-16 positions + assert_eq!(lines.utf8_to_utf16_pos(0), pos(0, 0)); // ❤️ + assert_eq!(lines.utf8_to_utf16_pos(1), pos(0, 0)); + assert_eq!(lines.utf8_to_utf16_pos(2), pos(0, 0)); + assert_eq!(lines.utf8_to_utf16_pos(3), pos(0, 1)); + assert_eq!(lines.utf8_to_utf16_pos(5), pos(0, 1)); + assert_eq!(lines.utf8_to_utf16_pos(4), pos(0, 1)); + assert_eq!(lines.utf8_to_utf16_pos(6), pos(0, 2)); // Á + assert_eq!(lines.utf8_to_utf16_pos(7), pos(0, 2)); + assert_eq!(lines.utf8_to_utf16_pos(8), pos(0, 3)); // + assert_eq!(lines.utf8_to_utf16_pos(9), pos(0, 4)); // 👋 + + // These middle utf8 byte positions don't have valid mappings: + // assert_eq!(lines.utf8_to_utf16_pos(10), pos(0, 4)); + // assert_eq!(lines.utf8_to_utf16_pos(11), pos(0, 5)); + // + // 👋 in utf16: 0xd83d 0xdc4b + // 👋 in utf8: 0xf0 0x9f 0x91 0x8b + // ^ ^ + // It's not really defined where these inner bytes map to and it + // doesn't matter because we would never report those byte offset as + // they are in the middle of a character and therefore invalid. + + assert_eq!(lines.utf8_to_utf16_pos(12), pos(0, 5)); + + // UTF-8 positions + assert_eq!(lines.utf8_to_char_pos(0), pos(0, 0)); // ❤️ + assert_eq!(lines.utf8_to_char_pos(1), pos(0, 0)); + assert_eq!(lines.utf8_to_char_pos(2), pos(0, 0)); + assert_eq!(lines.utf8_to_char_pos(3), pos(0, 1)); + assert_eq!(lines.utf8_to_char_pos(4), pos(0, 1)); + assert_eq!(lines.utf8_to_char_pos(5), pos(0, 1)); + assert_eq!(lines.utf8_to_char_pos(6), pos(0, 2)); // Á + assert_eq!(lines.utf8_to_char_pos(7), pos(0, 2)); + assert_eq!(lines.utf8_to_char_pos(8), pos(0, 3)); // + assert_eq!(lines.utf8_to_char_pos(9), pos(0, 4)); // 👋 + assert_eq!(lines.utf8_to_char_pos(10), pos(0, 4)); + assert_eq!(lines.utf8_to_char_pos(11), pos(0, 4)); + assert_eq!(lines.utf8_to_char_pos(12), pos(0, 4)); + } + + #[test] + fn test_critical_input_len() { + let content = [b'a'; 16384]; + let lines = Utf8Converter::from_bytes(&content); + assert_eq!(lines.utf8_to_utf16_pos(16384), pos(1, 0)); + } +} From d913518b7526351ccbbff409e0b8649aa4eb6959 Mon Sep 17 00:00:00 2001 From: Jason Orendorff Date: Fri, 1 Nov 2024 17:22:23 -0500 Subject: [PATCH 02/11] utf8-converter: Require docs for public items. --- crates/utf8-converter/src/lib.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/crates/utf8-converter/src/lib.rs b/crates/utf8-converter/src/lib.rs index 182583d..4b849f7 100644 --- a/crates/utf8-converter/src/lib.rs +++ b/crates/utf8-converter/src/lib.rs @@ -1,4 +1,5 @@ //! Position calculator to convert between byte, char, and line positions. 
+#![deny(missing_docs)] use std::ops::Range; @@ -125,6 +126,7 @@ impl Utf8Converter { self.line_begins.len() as u32 - 1 } + /// Returns true if the specified line is empty except for whitespace. pub fn only_whitespaces(&self, line_number: u32) -> bool { self.whitespace_only .get(line_number as usize) From 91ef54d6c9ea3dcc99a7dfb03915604f850fdc08 Mon Sep 17 00:00:00 2001 From: Jason Orendorff Date: Fri, 8 Nov 2024 16:52:21 -0600 Subject: [PATCH 03/11] rename --- .../Cargo.toml | 2 +- .../README.md | 0 .../src/bitrank.rs | 0 .../src/lib.rs | 32 +++++++++---------- 4 files changed, 17 insertions(+), 17 deletions(-) rename crates/{utf8-converter => string-offsets}/Cargo.toml (87%) rename crates/{utf8-converter => string-offsets}/README.md (100%) rename crates/{utf8-converter => string-offsets}/src/bitrank.rs (100%) rename crates/{utf8-converter => string-offsets}/src/lib.rs (96%) diff --git a/crates/utf8-converter/Cargo.toml b/crates/string-offsets/Cargo.toml similarity index 87% rename from crates/utf8-converter/Cargo.toml rename to crates/string-offsets/Cargo.toml index 49da760..5bb425f 100644 --- a/crates/utf8-converter/Cargo.toml +++ b/crates/string-offsets/Cargo.toml @@ -1,7 +1,7 @@ [package] authors = ["The blackbird team "] edition = "2021" -name = "utf8-converter" +name = "string-offests" version = "0.1.0" [dependencies] diff --git a/crates/utf8-converter/README.md b/crates/string-offsets/README.md similarity index 100% rename from crates/utf8-converter/README.md rename to crates/string-offsets/README.md diff --git a/crates/utf8-converter/src/bitrank.rs b/crates/string-offsets/src/bitrank.rs similarity index 100% rename from crates/utf8-converter/src/bitrank.rs rename to crates/string-offsets/src/bitrank.rs diff --git a/crates/utf8-converter/src/lib.rs b/crates/string-offsets/src/lib.rs similarity index 96% rename from crates/utf8-converter/src/lib.rs rename to crates/string-offsets/src/lib.rs index 4b849f7..70611fb 100644 --- a/crates/utf8-converter/src/lib.rs +++ b/crates/string-offsets/src/lib.rs @@ -11,7 +11,7 @@ use bitrank::{BitRank, BitRankBuilder}; /// /// Rust strings are UTF-8, but JavaScript has UTF-16 strings, while in Python, strings are /// sequences of Unicode code points. It's therefore necessary to adjust string positions when -/// communicating across programming language boundaries. [`Utf8Converter`] does these adjustments. +/// communicating across programming language boundaries. [`StringOffsets`] does these adjustments. /// /// ## Converting offsets /// @@ -24,7 +24,7 @@ use bitrank::{BitRank, BitRankBuilder}; /// - `utf16_pos` - Zero-based line number and `utf16` offset within the line. /// - `char_pos` - Zero-based line number and `char` offset within the line. /// -/// For example, [`Utf8Converter::utf8_to_utf16`] converts a Rust byte offset to a number that will +/// For example, [`StringOffsets::utf8_to_utf16`] converts a Rust byte offset to a number that will /// index to the same position in a JavaScript string. Offsets are expressed as `u32` or [`Pos`] /// values. /// @@ -46,14 +46,14 @@ use bitrank::{BitRank, BitRankBuilder}; /// When mapping offsets to line ranges, it is important to use a `_to_lines` function in order to /// end up with the correct line range. We have these methods because if you tried to do it /// yourself you would screw it up; use them! (And see the source code for -/// [`Utf8Converter::utf8s_to_lines`] if you don't believe us.) +/// [`StringOffsets::utf8s_to_lines`] if you don't believe us.) 
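To make the warning above concrete, here is a hedged sketch (example text and offsets are ours; the import assumes the package-name spelling fixed in the next commit):

```rust
use string_offsets::StringOffsets;

fn main() {
    let text = "AAA\nBBB\nCCC";
    let offsets = StringOffsets::new(text);

    // The byte range covering "A\nB" touches lines 0 and 1.
    assert_eq!(offsets.utf8s_to_lines(2..5), 0..2);
    // An empty range at a line boundary maps to the line that starts there.
    assert_eq!(offsets.utf8s_to_lines(4..4), 1..2);
    // Naively converting each endpoint with `utf8_to_line` would yield 0..1 here,
    // silently dropping line 1.
    assert_eq!(offsets.utf8_to_line(2)..offsets.utf8_to_line(5), 0..1);
}
```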
/// /// ## Complexity /// /// Most operations run in O(1) time, some require O(log n) time. The memory consumed by this data /// structure is typically less than the memory occupied by the actual content. In the best case, /// it requires ~25% of the content space. -pub struct Utf8Converter { +pub struct StringOffsets { // Vector storing for every line the byte position at which the line starts. line_begins: Vec, @@ -79,7 +79,7 @@ pub struct Pos { /// Zero-indexed line number. pub line: u32, /// Zero-indexed column number. The units of this field depend on the method that produces the - /// value. See [`Utf8Converter::utf8_to_char_pos`], [`Utf8Converter::utf8_to_utf16_pos`]. + /// value. See [`StringOffsets::utf8_to_char_pos`], [`StringOffsets::utf8_to_utf16_pos`]. pub col: u32, } @@ -101,7 +101,7 @@ pub struct Pos { // Question: Consider whether we should return an empty line range in this case which would // probably be consistent from a mathematical point of view. But then we should also return empty // line ranges for empty character ranges in the middle of a line... -impl Utf8Converter { +impl StringOffsets { /// Collects position information for the given string. pub fn new(content: &str) -> Self { new_converter(content.as_bytes()) @@ -109,7 +109,7 @@ impl Utf8Converter { /// Collects position information for a byte-string. /// - /// If `content` is UTF-8, this is just like [`Utf8Converter::new`]. Otherwise, the + /// If `content` is UTF-8, this is just like [`StringOffsets::new`]. Otherwise, the /// conversion methods involving characters will produce unspecified (but memory-safe) results. pub fn from_bytes(content: &[u8]) -> Self { new_converter(content) @@ -293,7 +293,7 @@ impl Utf8Converter { } } -fn new_converter(content: &[u8]) -> Utf8Converter { +fn new_converter(content: &[u8]) -> StringOffsets { let mut utf8_builder = BitRankBuilder::new(); let mut utf16_builder = BitRankBuilder::new(); let mut line_builder = BitRankBuilder::new(); @@ -334,7 +334,7 @@ fn new_converter(content: &[u8]) -> Utf8Converter { line_builder.push(content.len() - 1); } - Utf8Converter { + StringOffsets { line_begins, utf8_to_line: line_builder.finish(), whitespace_only, @@ -378,7 +378,7 @@ fn utf8_to_utf16_width(content: &[u8]) -> usize { #[cfg(test)] mod test { use super::is_char_boundary; - use crate::{utf8_to_utf16_width, utf8_width, Pos, Utf8Converter}; + use crate::{utf8_to_utf16_width, utf8_width, Pos, StringOffsets}; #[test] fn test_utf8_char_width() { @@ -417,7 +417,7 @@ mod test { let content = r#"a short line. followed by another one. 
no terminating newline!"#; - let lines = Utf8Converter::new(content); + let lines = StringOffsets::new(content); assert_eq!(lines.line_to_utf8s(0), 0..14); assert_eq!(&content[0..14], "a short line.\n"); assert_eq!(lines.line_to_utf8s(1), 14..39); @@ -463,7 +463,7 @@ no terminating newline!"#; fn test_convert_ascii() { let content = r#"line0 line1"#; - let lines = Utf8Converter::new(content); + let lines = StringOffsets::new(content); assert_eq!(lines.utf8_to_char_pos(0), pos(0, 0)); assert_eq!(lines.utf8_to_char_pos(1), pos(0, 1)); assert_eq!(lines.utf8_to_char_pos(6), pos(1, 0)); @@ -476,7 +476,7 @@ line1"#; let content = r#"❤️ line0 line1 ✅ line2"#; - let lines = Utf8Converter::new(content); + let lines = StringOffsets::new(content); assert_eq!(lines.utf8_to_char_pos(0), pos(0, 0)); // ❤️ takes 6 bytes to represent in utf8 (2 code points) assert_eq!(lines.utf8_to_char_pos(1), pos(0, 0)); assert_eq!(lines.utf8_to_char_pos(2), pos(0, 0)); @@ -507,7 +507,7 @@ line1 fn test_small() { // Á - 2 bytes utf8 let content = r#"❤️ line0 ❤️Á 👋"#; - let lines = Utf8Converter::new(content); + let lines = StringOffsets::new(content); let mut utf16_index = 0; let mut char_index = 0; for (byte_index, char) in content.char_indices() { @@ -527,7 +527,7 @@ line1 // ^~~~ utf8: 1 char, 1 byte, utf16: 1 code unit // ^~~~~ utf8: 1 char, 2 bytes, utf16: 1 code unit // ^~~~~~ utf8: 2 chars, 3 byte ea., utf16: 2 code units - let lines = Utf8Converter::new(content); + let lines = StringOffsets::new(content); // UTF-16 positions assert_eq!(lines.utf8_to_utf16_pos(0), pos(0, 0)); // ❤️ @@ -573,7 +573,7 @@ line1 #[test] fn test_critical_input_len() { let content = [b'a'; 16384]; - let lines = Utf8Converter::from_bytes(&content); + let lines = StringOffsets::from_bytes(&content); assert_eq!(lines.utf8_to_utf16_pos(16384), pos(1, 0)); } } From 4683f6dcdf23e8625d52d90d504014d4acc476b6 Mon Sep 17 00:00:00 2001 From: Jason Orendorff Date: Fri, 8 Nov 2024 17:01:50 -0600 Subject: [PATCH 04/11] Update with the latest from upstream --- crates/string-offsets/Cargo.toml | 4 +- crates/string-offsets/src/bitrank.rs | 159 +++++++++--------- crates/string-offsets/src/lib.rs | 237 ++++++++++++++------------- 3 files changed, 200 insertions(+), 200 deletions(-) diff --git a/crates/string-offsets/Cargo.toml b/crates/string-offsets/Cargo.toml index 5bb425f..842b7f7 100644 --- a/crates/string-offsets/Cargo.toml +++ b/crates/string-offsets/Cargo.toml @@ -1,10 +1,10 @@ [package] authors = ["The blackbird team "] edition = "2021" -name = "string-offests" +name = "string-offsets" version = "0.1.0" -[dependencies] +[dev-dependencies] itertools = "0.13" rand = "0.8" rand_chacha = "0.3" diff --git a/crates/string-offsets/src/bitrank.rs b/crates/string-offsets/src/bitrank.rs index fc2fa87..88c77cb 100644 --- a/crates/string-offsets/src/bitrank.rs +++ b/crates/string-offsets/src/bitrank.rs @@ -3,15 +3,15 @@ //! //! There is also an opportunistic `select` operation, but the general case has not been //! implemented. +//! +//! See also: ["Succinct data structure"](https://en.wikipedia.org/wiki/Succinct_data_structure). -type Chunk = u128; +type SubblockBits = u128; // Static sizing of the various components of the data structure. 
const BITS_PER_BLOCK: usize = 16384; -const BITS_PER_SUB_BLOCK: usize = 128; +const BITS_PER_SUB_BLOCK: usize = SubblockBits::BITS as usize; const SUB_BLOCKS_PER_BLOCK: usize = BITS_PER_BLOCK / BITS_PER_SUB_BLOCK; -const BITS_PER_CHUNK: usize = 128; -const CHUNKS_PER_SUB_BLOCK: usize = BITS_PER_SUB_BLOCK / BITS_PER_CHUNK; /// A container for a portion of the total bit vector and the associated indices. /// The bits within each chunk are stored from most significant bit (msb) to least significant bit (lsb). @@ -30,7 +30,6 @@ const CHUNKS_PER_SUB_BLOCK: usize = BITS_PER_SUB_BLOCK / BITS_PER_CHUNK; /// sub-block rank: [ 0 ][ 2 ] /// ``` #[derive(Clone, Debug)] -#[repr(C)] struct Block { /// Rank of the first bit in this block (that is, the number of bits set in previous blocks). rank: u64, @@ -39,38 +38,29 @@ struct Block { /// sub-blocks `0..i`. `sub_blocks[0]` is always zero. sub_blocks: [u16; SUB_BLOCKS_PER_BLOCK], /// The bit-vector. - bits: [Chunk; BITS_PER_BLOCK / BITS_PER_CHUNK], + bits: [SubblockBits; SUB_BLOCKS_PER_BLOCK], } impl Block { - fn new(rank: u64) -> Self { - Self { - rank, - sub_blocks: [0; SUB_BLOCKS_PER_BLOCK], - bits: [0; BITS_PER_BLOCK / BITS_PER_CHUNK], - } - } - /// Set a bit without updating `self.sub_blocks`. /// /// This panics if the bit was already set, because that indicates that the original positions /// list is invalid/had duplicates. fn set(&mut self, index: usize) { assert!(index < BITS_PER_BLOCK); - let chunk_idx = index / BITS_PER_CHUNK; - let bit_idx = index % BITS_PER_CHUNK; - let mask = 1 << ((BITS_PER_CHUNK - 1) - bit_idx); + let chunk_idx = index / BITS_PER_SUB_BLOCK; + let bit_idx = index % BITS_PER_SUB_BLOCK; + let mask = 1 << ((BITS_PER_SUB_BLOCK - 1) - bit_idx); assert_eq!(self.bits[chunk_idx] & mask, 0, "toggling bits off indicates that the original data was incorrect, most likely containing duplicate values."); self.bits[chunk_idx] ^= mask; } /// Tests whether the bit at the given index is set. - #[allow(dead_code)] fn get(&self, index: usize) -> bool { assert!(index < BITS_PER_BLOCK); - let chunk_idx = index / BITS_PER_CHUNK; - let bit_idx = index % BITS_PER_CHUNK; - let mask = 1 << ((BITS_PER_CHUNK - 1) - bit_idx); + let chunk_idx = index / BITS_PER_SUB_BLOCK; + let bit_idx = index % BITS_PER_SUB_BLOCK; + let mask = 1 << ((BITS_PER_SUB_BLOCK - 1) - bit_idx); self.bits[chunk_idx] & mask != 0 } @@ -84,19 +74,13 @@ impl Block { let sub_block = local_idx / BITS_PER_SUB_BLOCK; rank += self.sub_blocks[sub_block] as usize; - if BITS_PER_CHUNK != BITS_PER_SUB_BLOCK { - for i in sub_block * CHUNKS_PER_SUB_BLOCK..local_idx / BITS_PER_CHUNK { - rank += self.bits[i].count_ones() as usize; - } - } - - let remainder = local_idx % BITS_PER_CHUNK; + let remainder = local_idx % BITS_PER_SUB_BLOCK; - let last_chunk = local_idx / BITS_PER_CHUNK; + let last_chunk = local_idx / BITS_PER_SUB_BLOCK; let masked = if remainder == 0 { 0 } else { - self.bits[last_chunk] >> (BITS_PER_CHUNK - remainder) + self.bits[last_chunk] >> (BITS_PER_SUB_BLOCK - remainder) }; rank += masked.count_ones() as usize; let select = if masked == 0 { @@ -110,7 +94,7 @@ impl Block { fn total_rank(&self) -> usize { self.sub_blocks[SUB_BLOCKS_PER_BLOCK - 1] as usize + self.rank as usize - + self.bits[(SUB_BLOCKS_PER_BLOCK - 1) * CHUNKS_PER_SUB_BLOCK..] + + self.bits[SUB_BLOCKS_PER_BLOCK - 1..] 
.iter() .map(|c| c.count_ones() as usize) .sum::() @@ -151,24 +135,11 @@ impl Block { } } -impl Default for Block { - fn default() -> Self { - Block { - rank: 0, - sub_blocks: [0u16; SUB_BLOCKS_PER_BLOCK], - bits: [0; BITS_PER_BLOCK / BITS_PER_CHUNK], - } - } -} - /// Builder for creating a [`BitRank`]. /// /// # Examples /// /// ```text -/// // Note: This should work as a doctest, except this module is not public. -/// let mut bytes = Vec::::new(); -/// /// let mut builder = BitRankBuilder::new(); /// builder.push(17); /// builder.push(23); @@ -179,9 +150,6 @@ impl Default for Block { #[derive(Default)] pub struct BitRankBuilder { blocks: Vec, - curr_rank: u64, - curr_block_id: usize, - curr_block: Option, } impl BitRankBuilder { @@ -190,55 +158,56 @@ impl BitRankBuilder { Self::default() } - fn push_block(&mut self, mut block: Block) -> u64 { - let mut local_rank = 0; - for (i, chunk) in block.bits.iter().enumerate() { - // If the settings are ever changed, CHUNKS_PER_SUB_BLOCK will likely no longer be 1, so - // you will need this modulo. - #[expect(clippy::modulo_one)] - if i % CHUNKS_PER_SUB_BLOCK == 0 { - block.sub_blocks[i / CHUNKS_PER_SUB_BLOCK] = local_rank; + /// Returns a builder that can hold integers with values `0..cap`. + pub fn with_capacity(cap: usize) -> Self { + Self { + blocks: Vec::with_capacity(cap.div_ceil(BITS_PER_BLOCK)), + } + } + + fn finish_last_block(&mut self) -> u64 { + if let Some(block) = self.blocks.last_mut() { + let mut local_rank = 0; + for (i, chunk) in block.bits.iter().enumerate() { + block.sub_blocks[i] = local_rank; + local_rank += chunk.count_ones() as u16; } - local_rank += chunk.count_ones() as u16; + block.rank + local_rank as u64 + } else { + 0 } - let end_rank = block.rank + local_rank as u64; - self.blocks.push(block); - end_rank } /// Adds a bit. Bits must be added in order of increasing `position`. pub fn push(&mut self, position: usize) { let block_id = position / BITS_PER_BLOCK; assert!( - self.curr_block_id <= block_id, + self.blocks.len() <= block_id + 1, "positions must be increasing!" ); - while block_id > self.curr_block_id { - let curr_block = self - .curr_block - .take() - .unwrap_or_else(|| Block::new(self.curr_rank)); - let end_rank = self.push_block(curr_block); - self.curr_rank = end_rank; - self.curr_block_id += 1; - } - match &mut self.curr_block { - None => { - let mut block = Block::new(self.curr_rank); - block.set(position % BITS_PER_BLOCK); - self.curr_block = Some(block); - } - Some(block) => { - block.set(position % BITS_PER_BLOCK); + if block_id >= self.blocks.len() { + let curr_rank = self.finish_last_block(); + while block_id >= self.blocks.len() { + // Without this declared as a `const`, rustc 1.82 creates the Block value on the + // stack first, then `memcpy`s it into `self.blocks`. + const ZERO_BLOCK: Block = Block { + rank: 0, + sub_blocks: [0; SUB_BLOCKS_PER_BLOCK], + bits: [0; SUB_BLOCKS_PER_BLOCK], + }; + self.blocks.push(ZERO_BLOCK); + self.blocks.last_mut().expect("just inserted").rank = curr_rank; } } + self.blocks + .last_mut() + .expect("just ensured there are enough blocks") + .set(position % BITS_PER_BLOCK); } /// Finishes the `BitRank` by writing the last block of data. pub fn finish(mut self) -> BitRank { - if let Some(last_block) = self.curr_block.take() { - self.push_block(last_block); - } + self.finish_last_block(); BitRank { blocks: self.blocks, } @@ -256,8 +225,8 @@ impl BitRank { /// /// # Panics /// This may panic if the values produced by `iter` are not strictly increasing. 
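The builder path added in this patch can be sketched as follows (internal API; positions are illustrative and must be strictly increasing):

```rust
let mut builder = BitRankBuilder::with_capacity(1_000_000);
for pos in [10usize, 500_000, 999_999] {
    builder.push(pos);
}
let br = builder.finish();
assert_eq!(br.rank(500_001), 2); // {10, 500_000} lie below 500_001
assert_eq!(br.max_rank(), 3);
```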
- #[allow(clippy::should_implement_trait)] #[allow(dead_code)] + #[allow(clippy::should_implement_trait)] pub fn from_iter>(iter: I) -> BitRank { let mut builder = BitRankBuilder::new(); for position in iter { @@ -457,7 +426,7 @@ mod tests { let mut rank = 0; let mut select = None; for i in 0..random_bits.capacity() { - if i % BITS_PER_CHUNK == 0 { + if i % BITS_PER_SUB_BLOCK == 0 { select = None; } assert_eq!(br.rank_select(i), (rank, select)); @@ -501,4 +470,30 @@ mod tests { } } } + + #[test] + fn test_large_gap() { + let br = BitRank::from_iter((3..4).chain(BITS_PER_BLOCK * 15..BITS_PER_BLOCK * 15 + 17)); + for i in 1..15 { + assert_eq!(br.rank(BITS_PER_BLOCK * i), 1); + } + for i in 0..18 { + assert_eq!(br.rank(BITS_PER_BLOCK * 15 + i), 1 + i); + } + } + + #[test] + fn test_with_capacity() { + let mut b = BitRankBuilder::with_capacity(BITS_PER_BLOCK * 3 - 1); + let initial_capacity = b.blocks.capacity(); + assert!(initial_capacity >= 3); + b.push(BITS_PER_BLOCK * 3 - 2); // should not have to grow + assert_eq!(b.blocks.capacity(), initial_capacity); + + let mut b = BitRankBuilder::with_capacity(BITS_PER_BLOCK * 3 + 1); + let initial_capacity = b.blocks.capacity(); + assert!(initial_capacity >= 4); + b.push(BITS_PER_BLOCK * 3); // should not have to grow + assert_eq!(b.blocks.capacity(), initial_capacity); + } } diff --git a/crates/string-offsets/src/lib.rs b/crates/string-offsets/src/lib.rs index 70611fb..9535dc0 100644 --- a/crates/string-offsets/src/lib.rs +++ b/crates/string-offsets/src/lib.rs @@ -1,4 +1,6 @@ -//! Position calculator to convert between byte, char, and line positions. +//! Offset calculator to convert between byte, char, and line offsets in a string. +//! +//! See [`StringOffsets`] for details. #![deny(missing_docs)] use std::ops::Range; @@ -7,11 +9,17 @@ mod bitrank; use bitrank::{BitRank, BitRankBuilder}; -/// Position calculator to convert between byte, char, and line positions. +/// Offset calculator to convert between byte, char, and line offsets in a string. /// -/// Rust strings are UTF-8, but JavaScript has UTF-16 strings, while in Python, strings are -/// sequences of Unicode code points. It's therefore necessary to adjust string positions when -/// communicating across programming language boundaries. [`StringOffsets`] does these adjustments. +/// Rust strings are UTF-8, but JavaScript has UTF-16 strings, and in Python, strings are sequences +/// of Unicode code points. It's therefore necessary to adjust string offsets when communicating +/// across programming language boundaries. [`StringOffsets`] does these adjustments. +/// +/// Each `StringOffsets` value contains offset information for a single string. [Building the +/// data structure](StringOffsets::new) takes O(n) time and memory, but then each conversion is fast. +/// +/// ["UTF-8 Conversions with BitRank"](https://adaptivepatchwork.com/2023/07/10/utf-conversion/) +/// is a blog post explaining the implementation. /// /// ## Converting offsets /// @@ -25,16 +33,16 @@ use bitrank::{BitRank, BitRankBuilder}; /// - `char_pos` - Zero-based line number and `char` offset within the line. /// /// For example, [`StringOffsets::utf8_to_utf16`] converts a Rust byte offset to a number that will -/// index to the same position in a JavaScript string. Offsets are expressed as `u32` or [`Pos`] +/// index to the same position in a JavaScript string. Offsets are expressed as `usize` or [`Pos`] /// values. 
/// -/// All methods accept arguments that are off the end of the string (interpreting them as the end -/// of the string). +/// All methods accept arguments that are past the end of the string, interpreting them as pointing +/// to the end of the string. /// /// ## Converting ranges /// -/// Some methods translate position *ranges*. These are expressed as `Range` except for -/// `line`, which is a `u32`: +/// Some methods translate position *ranges*. These are expressed as `Range` except for +/// `line`, which is a `usize`: /// /// - `line` - Zero-based line numbers. The range a `line` refers to is the whole line, including /// the trailing newline character if any. @@ -50,26 +58,26 @@ use bitrank::{BitRank, BitRankBuilder}; /// /// ## Complexity /// -/// Most operations run in O(1) time, some require O(log n) time. The memory consumed by this data -/// structure is typically less than the memory occupied by the actual content. In the best case, -/// it requires ~25% of the content space. +/// Most operations run in O(1) time. A few require O(log n) time. The memory consumed by this +/// data structure is typically less than the memory occupied by the actual content. In the best +/// case, it requires ~45% of the content space. pub struct StringOffsets { - // Vector storing for every line the byte position at which the line starts. + /// Vector storing, for every line, the byte position at which the line starts. line_begins: Vec, - // Encoded bitrank where the rank of a byte position corresponds to the line number to which - // the byte belongs. + /// Encoded bitrank where the rank of a byte position corresponds to the line number to which + /// the byte belongs. utf8_to_line: BitRank, - // Encoded bitrank where the rank of a byte position corresponds to the char position to which - // the byte belongs. + /// Encoded bitrank where the rank of a byte position corresponds to the char position to which + /// the byte belongs. utf8_to_char: BitRank, - // Encoded bitrank where the rank of a byte position corresponds to the UTF-16 encoded word - // position to which the byte belongs. + /// Encoded bitrank where the rank of a byte position corresponds to the UTF-16 encoded word + /// position to which the byte belongs. utf8_to_utf16: BitRank, - // Marks for every line whether it only consists of whitespace characters. + /// Marks, for every line, whether it consists only of whitespace characters. whitespace_only: Vec, } @@ -77,10 +85,10 @@ pub struct StringOffsets { #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub struct Pos { /// Zero-indexed line number. - pub line: u32, + pub line: usize, /// Zero-indexed column number. The units of this field depend on the method that produces the /// value. See [`StringOffsets::utf8_to_char_pos`], [`StringOffsets::utf8_to_utf16_pos`]. - pub col: u32, + pub col: usize, } // The actual conversion implementation between utf8, utf16, chars, and line numbers. @@ -95,114 +103,113 @@ pub struct Pos { // of the line, but `range.end` "rounds up"; and because there are many corner cases. // // E.g.: The empty character range at the end of one line cannot be distinguished from the empty -// character range at the end of the subsequent line! This ambiguity is resolved by returning the +// character range at the start of the subsequent line! This ambiguity is resolved by returning the // line which starts with the empty character range. 
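 // For instance, in "a\nb" the empty range `2..2` sits both at the end of line 0 (after its
 // newline) and at the start of line 1; `utf8s_to_lines(2..2)` therefore reports `1..2`, while
 // `utf8s_to_lines(0..2)` reports `0..1` because the newline still belongs to line 0.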
 //
 // Question: Consider whether we should return an empty line range in this case which would
 // probably be consistent from a mathematical point of view. But then we should also return empty
 // line ranges for empty character ranges in the middle of a line...
 impl StringOffsets {
-    /// Collects position information for the given string.
+    /// Create a new converter to work with offsets into the given string.
     pub fn new(content: &str) -> Self {
         new_converter(content.as_bytes())
     }

-    /// Collects position information for a byte-string.
+    /// Create a new converter to work with offsets into the given byte-string.
     ///
     /// If `content` is UTF-8, this is just like [`StringOffsets::new`]. Otherwise, the
-    /// conversion methods involving characters will produce unspecified (but memory-safe) results.
+    /// conversion methods will produce unspecified (but memory-safe) results.
     pub fn from_bytes(content: &[u8]) -> Self {
         new_converter(content)
     }

     /// Returns the number of Unicode characters on the specified line.
-    pub fn line_chars(&self, line_number: u32) -> u32 {
+    pub fn line_chars(&self, line_number: usize) -> usize {
         let r = self.utf8s_to_chars(self.line_to_utf8s(line_number));
         r.end - r.start
     }

     /// Returns the number of lines in the string.
-    pub fn lines(&self) -> u32 {
-        self.line_begins.len() as u32 - 1
+    pub fn lines(&self) -> usize {
+        self.line_begins.len() - 1
     }

     /// Returns true if the specified line is empty except for whitespace.
-    pub fn only_whitespaces(&self, line_number: u32) -> bool {
+    pub fn only_whitespaces(&self, line_number: usize) -> bool {
         self.whitespace_only
-            .get(line_number as usize)
+            .get(line_number)
             .copied()
             .unwrap_or(true)
     }

     /// Return the byte offset of the first character on the specified (zero-based) line.
     ///
-    /// If `line_number` is greater than the number of lines in the text, this returns the length
-    /// of the string.
-    pub fn line_to_utf8_begin(&self, line_number: u32) -> u32 {
-        self.line_begins[line_number.min(self.lines()) as usize]
+    /// If `line_number` is greater than or equal to the number of lines in the text, this returns
+    /// the length of the string.
+    pub fn line_to_utf8_begin(&self, line_number: usize) -> usize {
+        self.line_begins[line_number.min(self.lines())] as usize
     }

-    /// Python-style offset of the first character of a line.
-    pub fn line_to_char_begin(&self, line_number: u32) -> u32 {
-        self.utf8_to_char(self.line_to_utf8_begin(line_number))
+    /// UTF-16 offset of the first character of a line.
+    ///
+    /// That is, return the offset that would point to the start of that line in a UTF-16
+    /// representation of the source string.
+    pub fn line_to_utf16_begin(&self, line_number: usize) -> usize {
+        self.utf8_to_utf16(self.line_to_utf8_begin(line_number))
     }

-    /// JS-style offset of the first character of a line.
-    pub fn line_to_utf16_begin(&self, line_number: u32) -> u32 {
-        self.utf8_to_utf16(self.line_to_utf8_begin(line_number))
+    /// UTF-32 offset of the first character of a line.
+    ///
+    /// That is, return the offset that would point to the start of that line in a UTF-32
+    /// representation of the source string.
+    pub fn line_to_char_begin(&self, line_number: usize) -> usize {
+        self.utf8_to_char(self.line_to_utf8_begin(line_number))
     }

-    /// Rust-style offset of the first character of a line.
-    pub fn line_to_utf8_end(&self, line_number: u32) -> u32 {
+    /// UTF-8 offset one past the end of a line (the offset of the start of the next line).
+    pub fn line_to_utf8_end(&self, line_number: usize) -> usize {
         self.line_to_utf8_begin(line_number + 1)
     }

-    /// Python-style offset one past the end of a line (the offset of the start of the next line).
-    pub fn line_to_char_end(&self, line_number: u32) -> u32 {
-        self.utf8_to_char(self.line_to_utf8_end(line_number))
+    /// UTF-16 offset one past the end of a line (the offset of the start of the next line).
+    pub fn line_to_utf16_end(&self, line_number: usize) -> usize {
+        self.utf8_to_utf16(self.line_to_utf8_end(line_number))
     }

-    /// JS-style offset one past the end of a line (the offset of the start of the next line).
-    pub fn line_to_utf16_end(&self, line_number: u32) -> u32 {
-        self.utf8_to_utf16(self.line_to_utf8_end(line_number))
+    /// UTF-32 offset one past the end of a line (the offset of the start of the next line).
+    pub fn line_to_char_end(&self, line_number: usize) -> usize {
+        self.utf8_to_char(self.line_to_utf8_end(line_number))
     }

-    /// Rust-style offset one past the end of a line (the offset of the start of the next line).
-    pub fn line_to_utf8s(&self, line_number: u32) -> Range<u32> {
+    /// UTF-8 offsets for the beginning and end of a line, including the newline if any.
+    pub fn line_to_utf8s(&self, line_number: usize) -> Range<usize> {
         self.line_to_utf8_begin(line_number)..self.line_to_utf8_end(line_number)
     }

-    /// Python-style offsets for the beginning and end of a line, including the newline if any.
-    pub fn line_to_chars(&self, line_number: u32) -> Range<u32> {
+    /// UTF-32 offsets for the beginning and end of a line, including the newline if any.
+    pub fn line_to_chars(&self, line_number: usize) -> Range<usize> {
         self.utf8s_to_chars(self.line_to_utf8s(line_number))
     }

-    /// Rust-style offsets for the beginning and end of a line, including the newline if any.
-    pub fn lines_to_utf8s(&self, line_numbers: Range<u32>) -> Range<u32> {
+    /// UTF-8 offsets for the beginning and end of a range of lines, including the newline if any.
+    pub fn lines_to_utf8s(&self, line_numbers: Range<usize>) -> Range<usize> {
         self.line_to_utf8_begin(line_numbers.start)..self.line_to_utf8_begin(line_numbers.end)
     }

-    /// Python-style offsets for the beginning and end of a range of lines, including the newline
-    /// of the last line, if any.
-    pub fn lines_to_chars(&self, line_numbers: Range<u32>) -> Range<u32> {
+    /// UTF-32 offsets for the beginning and end of a range of lines, including the newline if any.
+    pub fn lines_to_chars(&self, line_numbers: Range<usize>) -> Range<usize> {
         self.utf8s_to_chars(self.lines_to_utf8s(line_numbers))
     }

-    /// Return the range of line numbers containing the substring specified by the Python-style
-    /// range `chars`. Newline characters count as part of the preceding line.
-    pub fn chars_to_lines(&self, chars: Range<u32>) -> Range<u32> {
-        self.utf8s_to_lines(self.chars_to_utf8s(chars))
-    }
-
-    /// Return the zero-based line number of the line containing the specified Rust-style offset.
+    /// Return the zero-based line number of the line containing the specified UTF-8 offset.
     /// Newline characters count as part of the preceding line.
-    pub fn utf8_to_line(&self, byte_number: u32) -> u32 {
-        self.utf8_to_line.rank(byte_number as usize) as u32
+    pub fn utf8_to_line(&self, byte_number: usize) -> usize {
+        self.utf8_to_line.rank(byte_number)
     }

-    /// Converts a Rust-style offset to a zero-based line number and Python-style offset within the
+    /// Converts a UTF-8 offset to a zero-based line number and UTF-32 offset within the
     /// line.
- pub fn utf8_to_char_pos(&self, byte_number: u32) -> Pos { + pub fn utf8_to_char_pos(&self, byte_number: usize) -> Pos { let line = self.utf8_to_line(byte_number); let line_start_char_number = self.line_to_char_begin(line); let char_idx = self.utf8_to_char(byte_number); @@ -212,9 +219,9 @@ impl StringOffsets { } } - /// Converts a Rust-style offset to a zero-based line number and JS-style offset within the + /// Converts a UTF-8 offset to a zero-based line number and UTF-16 offset within the /// line. - pub fn utf8_to_utf16_pos(&self, byte_number: u32) -> Pos { + pub fn utf8_to_utf16_pos(&self, byte_number: usize) -> Pos { let line = self.utf8_to_line(byte_number); let line_start_char_number = self.line_to_utf16_begin(line); let char_idx = self.utf8_to_utf16(byte_number); @@ -230,7 +237,7 @@ impl StringOffsets { /// If `bytes` is an empty range at a position within or at the beginning of a line, this /// returns a nonempty range containing the line number of that one line. An empty range at or /// beyond the end of the string translates to an empty range of line numbers. - pub fn utf8s_to_lines(&self, bytes: Range) -> Range { + pub fn utf8s_to_lines(&self, bytes: Range) -> Range { // The fiddly parts of this formula are necessary because `bytes.start` rounds down to the // beginning of the line, but `bytes.end` "rounds up" to the end of the line. the final // `+1` is to produce a half-open range. @@ -240,18 +247,24 @@ impl StringOffsets { .min(self.utf8_to_line(bytes.end.saturating_sub(1).max(bytes.start)) + 1) } - /// Converts a Rust-style offset to Python style. - pub fn utf8_to_char(&self, byte_number: u32) -> u32 { - self.utf8_to_char.rank(byte_number as usize) as u32 + /// Returns the range of line numbers containing the substring specified by the UTF-32 + /// range `chars`. Newline characters count as part of the preceding line. + pub fn chars_to_lines(&self, chars: Range) -> Range { + self.utf8s_to_lines(self.chars_to_utf8s(chars)) + } + + /// Converts a UTF-8 offset to a UTF-32 offset. + pub fn utf8_to_char(&self, byte_number: usize) -> usize { + self.utf8_to_char.rank(byte_number) } - /// Converts a Rust-style offset to JS style. - pub fn utf8_to_utf16(&self, byte_number: u32) -> u32 { - self.utf8_to_utf16.rank(byte_number as usize) as u32 + /// Converts a UTF-8 offset to a UTF-16 offset. + pub fn utf8_to_utf16(&self, byte_number: usize) -> usize { + self.utf8_to_utf16.rank(byte_number) } - /// Converts a Python-style offset to Rust style. - pub fn char_to_utf8(&self, char_number: u32) -> u32 { + /// Converts a UTF-32 offset to a UTF-8 offset. + pub fn char_to_utf8(&self, char_number: usize) -> usize { let mut byte_number = char_number; for _ in 0..128 { let char_number2 = self.utf8_to_char(byte_number); @@ -263,14 +276,15 @@ impl StringOffsets { // If we couldn't find the char within 128 steps, then the char_number might be invalid! // This does not usually happen. For consistency with the rest of the code, we simply return // the max utf8 position in this case. 
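         // (In the common case the loop above terminates quickly: every code point occupies at
         // least one UTF-8 byte, so while `byte_number` is still short of the target,
         // `utf8_to_char(byte_number)` stays below `char_number`; each round therefore advances
         // `byte_number` by at least one byte and never moves past the target position.)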
- if char_number > self.utf8_to_char.max_rank() as u32 { + if char_number > self.utf8_to_char.max_rank() { return self .line_begins .last() .copied() - .expect("last entry represents the length of the file!"); + .expect("last entry represents the length of the file!") + as usize; } - let limit = *self.line_begins.last().expect("no line begins"); + let limit = *self.line_begins.last().expect("no line begins") as usize; // Otherwise, we keep searching, but are a bit more careful and add a check that we don't run into an infinite loop. loop { let char_number2 = self.utf8_to_char(byte_number); @@ -282,21 +296,22 @@ impl StringOffsets { } } - /// Converts a Rust-style offset range to Python style. - pub fn utf8s_to_chars(&self, bytes: Range) -> Range { + /// Converts a UTF-8 offset range to a UTF-32 offset range. + pub fn utf8s_to_chars(&self, bytes: Range) -> Range { self.utf8_to_char(bytes.start)..self.utf8_to_char(bytes.end) } - /// Converts a Python-style offset range to Rust style. - pub fn chars_to_utf8s(&self, chars: Range) -> Range { + /// Converts a UTF-32 offset range to a UTF-8 offset range. + pub fn chars_to_utf8s(&self, chars: Range) -> Range { self.char_to_utf8(chars.start)..self.char_to_utf8(chars.end) } } fn new_converter(content: &[u8]) -> StringOffsets { - let mut utf8_builder = BitRankBuilder::new(); - let mut utf16_builder = BitRankBuilder::new(); - let mut line_builder = BitRankBuilder::new(); + let n = content.len(); + let mut utf8_builder = BitRankBuilder::with_capacity(n); + let mut utf16_builder = BitRankBuilder::with_capacity(n); + let mut line_builder = BitRankBuilder::with_capacity(n); let mut line_begins = vec![0]; let mut i = 0; let mut whitespace_only = vec![]; @@ -343,22 +358,8 @@ fn new_converter(content: &[u8]) -> StringOffsets { } } -/// Returns true if, in a UTF-8 string, `b` always indicates the first byte of a character. -/// -/// (This is true for bytes `0..=127` and `192..=255`.) -pub fn is_char_boundary(b: u8) -> bool { - // Single byte encodings satisfy the bit pattern 0xxxxxxx, i.e. b < 128 - // Continuation bytes satisfy the bit pattern 10xxxxxx, i.e. b < 192 - // The rest are bytes belonging to the first byte of multi byte encodings (11xxxxxx): b >= 192 - // - // When interpreting the byte representation as signed integers, then numbers in the range - // 128..192 correspond to the smallest representable numbers. I.e. the two ranges [0, 128) and - // [192, 256) can be tested with a single signed comparison. - b as i8 >= -0x40 // NB: b < 128 || b >= 192 -} - -/// Returns the number of bytes this utf8 char occupies given the first byte of the utf8 encoding. -/// Returns 0 if the byte is not a valid first byte of a utf8 char. +/// Returns the number of bytes a UTF-8 char occupies, given the first byte of the UTF-8 encoding. +/// Returns 0 if the byte is not a valid first byte of a UTF-8 char. fn utf8_width(c: u8) -> usize { // Every nibble represents the utf8 length given the first 4 bits of a utf8 encoded byte. const UTF8_WIDTH: usize = 0x4322_0000_1111_1111; @@ -376,9 +377,13 @@ fn utf8_to_utf16_width(content: &[u8]) -> usize { } #[cfg(test)] -mod test { - use super::is_char_boundary; - use crate::{utf8_to_utf16_width, utf8_width, Pos, StringOffsets}; +mod tests { + use super::*; + + /// Returns true if, in a UTF-8 string, `b` indicates the first byte of a character. 
+ fn is_char_boundary(b: u8) -> bool { + b as i8 >= -0x40 // NB: b < 128 || b >= 192 + } #[test] fn test_utf8_char_width() { @@ -455,7 +460,7 @@ no terminating newline!"#; assert_eq!(lines.utf8s_to_lines(63..63), 3..3); } - fn pos(line: u32, col: u32) -> Pos { + fn pos(line: usize, col: usize) -> Pos { Pos { line, col } } @@ -511,13 +516,13 @@ line1 let mut utf16_index = 0; let mut char_index = 0; for (byte_index, char) in content.char_indices() { - assert_eq!(lines.utf8_to_char(byte_index as u32), char_index); - assert_eq!(lines.utf8_to_utf16(byte_index as u32), utf16_index); + assert_eq!(lines.utf8_to_char(byte_index), char_index); + assert_eq!(lines.utf8_to_utf16(byte_index), utf16_index); char_index += 1; - utf16_index += char.len_utf16() as u32; + utf16_index += char.len_utf16(); } - assert_eq!(lines.utf8_to_char(content.len() as u32), char_index); - assert_eq!(lines.utf8_to_utf16(content.len() as u32), utf16_index); + assert_eq!(lines.utf8_to_char(content.len()), char_index); + assert_eq!(lines.utf8_to_utf16(content.len()), utf16_index); } #[test] From db9b2cb11b34ec3af3bc84daf8fed341a3d1a0a5 Mon Sep 17 00:00:00 2001 From: Jason Orendorff Date: Fri, 8 Nov 2024 17:03:22 -0600 Subject: [PATCH 05/11] renaming intensifies --- crates/string-offsets/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/string-offsets/README.md b/crates/string-offsets/README.md index 668d35d..e7eb5d5 100644 --- a/crates/string-offsets/README.md +++ b/crates/string-offsets/README.md @@ -1,4 +1,4 @@ -# UTF-8 Converter +# string-offsets This crate converts string positions between Rust style (UTF-8 byte offsets) and styles used by other programming languages, as well as line numbers. @@ -8,5 +8,5 @@ Add this to your `Cargo.toml`: ```toml [dependencies] -utf8-converter = "0.1" +string-offsets = "0.1" ``` From fd056fbac6d422e0ef7fca58fed03c9b9bc5079a Mon Sep 17 00:00:00 2001 From: Jason Orendorff Date: Tue, 12 Nov 2024 09:49:24 -0600 Subject: [PATCH 06/11] update readme after renaming --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 8d42c06..a97abe3 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ A collection of useful algorithms written in Rust. Currently contains: - [`geo_filters`](crates/geo_filters): probabilistic data structures that solve the [Distinct Count Problem](https://en.wikipedia.org/wiki/Count-distinct_problem) using geometric filters. - [`bpe`](crates/bpe): fast, correct, and novel algorithms for the [Byte Pair Encoding Algorithm](https://en.wikipedia.org/wiki/Large_language_model#BPE) which are particularly useful for chunking of documents. -- [`utf8-converter`](crates/utf8-converter): converts string positions between bytes, chars, UTF-16 code units, and line numbers. Useful when sending string indices across language boundaries. +- [`string-offsets`](crates/string-offsets): converts string positions between bytes, chars, UTF-16 code units, and line numbers. Useful when sending string indices across language boundaries. 
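To make that last bullet concrete, here is a minimal sketch (the string and offsets are illustrative, and the calls assume the `StringOffsets` API introduced by the patches in this series): a byte offset computed on the Rust side becomes the line and UTF-16 column that a JavaScript or LSP consumer expects.

```rust
use string_offsets::StringOffsets;

fn main() {
    // U+00E9 ("é") is two UTF-8 bytes but one UTF-16 code unit, so byte and
    // UTF-16 offsets drift apart after it.
    let src = "caf\u{e9}\nbar";
    let offsets = StringOffsets::new(src);

    // The byte offset of 'b', the start of the second line, on the Rust side...
    let byte = src.find('b').unwrap();
    assert_eq!(byte, 6);

    // ...corresponds to line 1, UTF-16 column 0 for a JavaScript/LSP consumer,
    // and to flat UTF-16 offset 5 ("café" is four code units, plus the newline).
    let pos = offsets.utf8_to_utf16_pos(byte);
    assert_eq!((pos.line, pos.col), (1, 0));
    assert_eq!(offsets.utf8_to_utf16(byte), 5);
}
```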
## Background From ef3257565c9d099eddd9f2d379566563654eae34 Mon Sep 17 00:00:00 2001 From: Jason Orendorff Date: Tue, 12 Nov 2024 11:20:24 -0600 Subject: [PATCH 07/11] Flesh out string-offsets README --- crates/string-offsets/README.md | 35 +++++++++++++++++++++++++++++++- crates/string-offsets/src/lib.rs | 22 ++++++++++++++++++++ 2 files changed, 56 insertions(+), 1 deletion(-) diff --git a/crates/string-offsets/README.md b/crates/string-offsets/README.md index e7eb5d5..fd04fc6 100644 --- a/crates/string-offsets/README.md +++ b/crates/string-offsets/README.md @@ -1,6 +1,16 @@ # string-offsets -This crate converts string positions between Rust style (UTF-8 byte offsets) and styles used by other programming languages, as well as line numbers. +Offset calculator to convert between byte, char, and line offsets in a string. + +Rust strings are UTF-8, but JavaScript has UTF-16 strings, and in Python, strings are sequences of +Unicode code points. It's therefore necessary to adjust string offsets when communicating across +programming language boundaries. [`StringOffsets`] does these adjustments. + +Each `StringOffsets` value contains offset information for a single string. [Building the data +structure](StringOffsets::new) takes O(n) time and memory, but then each conversion is fast. + +["UTF-8 Conversions with BitRank"](https://adaptivepatchwork.com/2023/07/10/utf-conversion/) is a +blog post explaining the implementation. ## Usage @@ -10,3 +20,26 @@ Add this to your `Cargo.toml`: [dependencies] string-offsets = "0.1" ``` + +Then: + +```rust +use string_offsets::StringOffsets; + +let s = "☀️hello\n🗺️world\n"; +let offsets = StringOffsets::new(s); + +// Find offsets where lines begin and end. +assert_eq!(offsets.line_to_utf8s(0), 0..12); // note: 0-based line numbers + +// Translate string offsets between UTF-8 and other encodings. +// This map emoji is 7 UTF-8 bytes... +assert_eq!(&s[12..19], "🗺️"); +// ...but only 3 UTF-16 code units... +assert_eq!(offsets.utf8_to_utf16(12), 8); +assert_eq!(offsets.utf8_to_utf16(19), 11); +// ...and only 2 Unicode characters. +assert_eq!(offsets.utf8s_to_chars(12..19), 8..10); +``` + +See [the documentation](https://docs.rs/string-offsets/latest/string_offsets/struct.StringOffsets.html) for more. diff --git a/crates/string-offsets/src/lib.rs b/crates/string-offsets/src/lib.rs index 9535dc0..02ea569 100644 --- a/crates/string-offsets/src/lib.rs +++ b/crates/string-offsets/src/lib.rs @@ -1,5 +1,27 @@ //! Offset calculator to convert between byte, char, and line offsets in a string. //! +//! +//! # Example +//! +//! ``` +//! use string_offsets::StringOffsets; +//! +//! let s = "☀️hello\n🗺️world\n"; +//! let offsets = StringOffsets::new(s); +//! +//! // Find offsets where lines begin and end. +//! assert_eq!(offsets.line_to_utf8s(0), 0..12); // note: 0-based line numbers +//! +//! // Translate string offsets between UTF-8 and other encodings. +//! // This map emoji is 7 UTF-8 bytes... +//! assert_eq!(&s[12..19], "🗺️"); +//! // ...but only 3 UTF-16 code units... +//! assert_eq!(offsets.utf8_to_utf16(12), 8); +//! assert_eq!(offsets.utf8_to_utf16(19), 11); +//! // ...and only 2 Unicode characters. +//! assert_eq!(offsets.utf8s_to_chars(12..19), 8..10); +//! ``` +//! //! See [`StringOffsets`] for details. 
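The doctest above focuses on flat offset conversions; the line-oriented methods from the earlier patches follow the rule that a newline belongs to the line it terminates. A minimal sketch, assuming the `utf8s_to_lines` behavior documented in this series:

```rust
use string_offsets::StringOffsets;

fn main() {
    let s = "fn main() {\n    println!(\"hi\");\n}\n";
    let offsets = StringOffsets::new(s);

    // A byte range touching the second and third lines maps to the half-open
    // line range 1..3.
    let body = s.find("println").unwrap()..s.len();
    assert_eq!(offsets.utf8s_to_lines(body), 1..3);

    // An empty byte range at the start of a line still reports that one line
    // (byte 12 is where the second line begins).
    assert_eq!(offsets.utf8s_to_lines(12..12), 1..2);
}
```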
#![deny(missing_docs)] From 2fcc9034bbeb030224ea261f656dcad5f239e7b6 Mon Sep 17 00:00:00 2001 From: Jason Orendorff Date: Tue, 12 Nov 2024 13:09:42 -0600 Subject: [PATCH 08/11] last-minute polishing --- crates/string-offsets/Cargo.toml | 9 +++++++-- crates/string-offsets/src/lib.rs | 1 - 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/crates/string-offsets/Cargo.toml b/crates/string-offsets/Cargo.toml index 842b7f7..dabf4d3 100644 --- a/crates/string-offsets/Cargo.toml +++ b/crates/string-offsets/Cargo.toml @@ -1,8 +1,13 @@ [package] -authors = ["The blackbird team "] -edition = "2021" name = "string-offsets" +authors = ["The blackbird team "] version = "0.1.0" +edition = "2021" +description = "Offset calculator to convert between byte, char, and line offsets in a string." +repository = "https://github.com/github/rust-gems" +license = "MIT" +keywords = ["unicode", "string", "offsets", "positions", "interoperability"] +categories = ["algorithms", "data-structures", "text-processing", "development-tools::ffi"] [dev-dependencies] itertools = "0.13" diff --git a/crates/string-offsets/src/lib.rs b/crates/string-offsets/src/lib.rs index 02ea569..a24d45c 100644 --- a/crates/string-offsets/src/lib.rs +++ b/crates/string-offsets/src/lib.rs @@ -1,6 +1,5 @@ //! Offset calculator to convert between byte, char, and line offsets in a string. //! -//! //! # Example //! //! ``` From bfaa2dee4e8bed3fdfeb1b99b04efe01685ac567 Mon Sep 17 00:00:00 2001 From: Jason Orendorff Date: Tue, 12 Nov 2024 13:18:48 -0600 Subject: [PATCH 09/11] remove dead code --- crates/string-offsets/src/bitrank.rs | 181 ++++----------------------- 1 file changed, 26 insertions(+), 155 deletions(-) diff --git a/crates/string-offsets/src/bitrank.rs b/crates/string-offsets/src/bitrank.rs index 88c77cb..6524769 100644 --- a/crates/string-offsets/src/bitrank.rs +++ b/crates/string-offsets/src/bitrank.rs @@ -1,9 +1,6 @@ //! A bit-vector data structure, optimized for //! [rank](http://bitmagic.io/rank-select.html) operations. //! -//! There is also an opportunistic `select` operation, but the general case has not been -//! implemented. -//! //! See also: ["Succinct data structure"](https://en.wikipedia.org/wiki/Succinct_data_structure). type SubblockBits = u128; @@ -55,15 +52,6 @@ impl Block { self.bits[chunk_idx] ^= mask; } - /// Tests whether the bit at the given index is set. - fn get(&self, index: usize) -> bool { - assert!(index < BITS_PER_BLOCK); - let chunk_idx = index / BITS_PER_SUB_BLOCK; - let bit_idx = index % BITS_PER_SUB_BLOCK; - let mask = 1 << ((BITS_PER_SUB_BLOCK - 1) - bit_idx); - self.bits[chunk_idx] & mask != 0 - } - /// The **total rank** of the block relative local index, and the index of the one /// bit that establishes that rank (aka "select") **if** it occurs within that same /// chunk, otherwise ['None']. 
The assumption is that if you would have to look back @@ -99,40 +87,6 @@ impl Block { .map(|c| c.count_ones() as usize) .sum::() } - - fn predecessor(&self, idx: usize) -> Option { - let sub_block = idx / BITS_PER_SUB_BLOCK; - let masked = self.bits[sub_block] >> (BITS_PER_SUB_BLOCK - 1 - idx % BITS_PER_SUB_BLOCK); - if masked > 0 { - Some(idx - masked.trailing_zeros() as usize) - } else { - for i in (0..sub_block).rev() { - let masked = self.bits[i]; - if masked > 0 { - return Some( - (i + 1) * BITS_PER_SUB_BLOCK - masked.trailing_zeros() as usize - 1, - ); - } - } - None - } - } - - fn successor(&self, idx: usize) -> Option { - let sub_block = idx / BITS_PER_SUB_BLOCK; - let masked = self.bits[sub_block] << (idx % BITS_PER_SUB_BLOCK); - if masked > 0 { - Some(idx + masked.leading_zeros() as usize) - } else { - for i in (sub_block + 1)..SUB_BLOCKS_PER_BLOCK { - let masked = self.bits[i]; - if masked > 0 { - return Some(i * BITS_PER_SUB_BLOCK + masked.leading_zeros() as usize); - } - } - None - } - } } /// Builder for creating a [`BitRank`]. @@ -154,6 +108,7 @@ pub struct BitRankBuilder { impl BitRankBuilder { /// Returns a new builder. + #[cfg(test)] pub fn new() -> Self { Self::default() } @@ -221,20 +176,6 @@ pub struct BitRank { } impl BitRank { - /// Creates a `BitRank` containing the integers in `iter`. - /// - /// # Panics - /// This may panic if the values produced by `iter` are not strictly increasing. - #[allow(dead_code)] - #[allow(clippy::should_implement_trait)] - pub fn from_iter>(iter: I) -> BitRank { - let mut builder = BitRankBuilder::new(); - for position in iter { - builder.push(position); - } - builder.finish() - } - /// The rank at the specified index (exclusive). /// /// The (one) rank is defined as: `rank(i) = sum(b[j] for j in 0..i)` @@ -243,51 +184,6 @@ impl BitRank { self.rank_select(idx).0 } - /// Tests whether the bit at the given index is set. - #[allow(dead_code)] - pub fn get(&self, idx: usize) -> bool { - let block_num = idx / BITS_PER_BLOCK; - // assert!(block_num < self.blocks.len(), "index out of bounds"); - if block_num >= self.blocks.len() { - false - } else { - self.blocks[block_num].get(idx % BITS_PER_BLOCK) - } - } - - /// Returns the 1 bit at or before the specified index. - #[allow(dead_code)] - pub fn predecessor(&self, idx: usize) -> usize { - let block_num = idx / BITS_PER_BLOCK; - if block_num < self.blocks.len() { - if let Some(p) = self.blocks[block_num].predecessor(idx % BITS_PER_BLOCK) { - return block_num * BITS_PER_BLOCK + p; - } - } - for block_num in (0..self.blocks.len().min(block_num)).rev() { - if let Some(p) = self.blocks[block_num].predecessor(BITS_PER_BLOCK - 1) { - return block_num * BITS_PER_BLOCK + p; - } - } - panic!("no predecessor found!"); - } - - /// Returns the next 1 bit at or after the specified index. - #[allow(dead_code)] - pub fn successor(&self, idx: usize) -> usize { - let block_num = idx / BITS_PER_BLOCK; - if let Some(s) = self.blocks[block_num].successor(idx % BITS_PER_BLOCK) { - s + block_num * BITS_PER_BLOCK - } else { - for block_num in block_num + 1..self.blocks.len() { - if let Some(p) = self.blocks[block_num].successor(0) { - return block_num * BITS_PER_BLOCK + p; - } - } - panic!("no successor found!"); - } - } - /// Returns the number of elements in the set. pub fn max_rank(&self) -> usize { self.blocks @@ -314,58 +210,55 @@ impl BitRank { (rank, b_idx.map(|i| (block_num * BITS_PER_BLOCK) + i)) } } - - /// The total size of the bit vec that was allocated. 
- /// **Note:** This is more like capacity than normal `len` in that it does not - /// consider how much of the bit vec is actually used. - #[allow(dead_code)] - pub fn capacity(&self) -> usize { - self.blocks.len() * BITS_PER_BLOCK - } } #[cfg(test)] mod tests { - use itertools::Itertools; use rand::distributions::Uniform; use rand::prelude::*; use rand_chacha::ChaCha8Rng; use super::*; - fn write(positions: &[usize]) -> BitRank { - BitRank::from_iter(positions.iter().copied()) + /// Creates a `BitRank` containing the integers in `iter` (which should be strictly + /// increasing). + pub fn bitrank>(iter: I) -> BitRank { + let mut builder = BitRankBuilder::new(); + for position in iter { + builder.push(position); + } + builder.finish() } #[test] fn test_rank_zero() { - let br = BitRank::from_iter([0]); + let br = bitrank([0]); assert_eq!(br.rank(0), 0); assert_eq!(br.rank(1), 1); } #[test] fn test_empty() { - let br = BitRank::from_iter([]); + let br = bitrank([]); assert!(br.blocks.is_empty()); } #[test] fn test_index_out_of_bounds() { - let br = BitRank::from_iter([BITS_PER_BLOCK - 1]); + let br = bitrank([BITS_PER_BLOCK - 1]); assert_eq!(br.rank(BITS_PER_BLOCK), 1); } #[test] #[should_panic] fn test_duplicate_position() { - write(&[64, 66, 68, 68, 90]); + bitrank([64, 66, 68, 68, 90]); } #[test] fn test_rank_exclusive() { - let br = BitRank::from_iter(0..132); - assert_eq!(br.capacity(), BITS_PER_BLOCK); + let br = bitrank(0..132); + assert_eq!(br.blocks.len(), 1); assert_eq!(br.rank(64), 64); assert_eq!(br.rank(132), 132); } @@ -374,15 +267,13 @@ mod tests { fn test_rank() { let mut positions: Vec = (0..132).collect(); positions.append(&mut vec![138usize, 140, 146]); - let br = write(&positions); + let br = bitrank(positions); assert_eq!(br.rank(135), 132); - let bits2: Vec = (0..BITS_PER_BLOCK - 5).collect(); - let br2 = write(&bits2); + let br2 = bitrank(0..BITS_PER_BLOCK - 5); assert_eq!(br2.rank(169), 169); - let bits3: Vec = (0..BITS_PER_BLOCK + 5).collect(); - let br3 = write(&bits3); + let br3 = bitrank(0..BITS_PER_BLOCK + 5); assert_eq!(br3.rank(BITS_PER_BLOCK), BITS_PER_BLOCK); } @@ -390,23 +281,23 @@ mod tests { fn test_rank_idx() { let mut positions: Vec = (0..132).collect(); positions.append(&mut vec![138usize, 140, 146]); - let br = write(&positions); + let br = bitrank(positions); assert_eq!(br.rank_select(135), (132, Some(131))); let bits2: Vec = (0..BITS_PER_BLOCK - 5).collect(); - let br2 = write(&bits2); + let br2 = bitrank(bits2); assert_eq!(br2.rank_select(169), (169, Some(168))); let bits3: Vec = (0..BITS_PER_BLOCK + 5).collect(); - let br3 = write(&bits3); + let br3 = bitrank(bits3); assert_eq!(br3.rank_select(BITS_PER_BLOCK), (BITS_PER_BLOCK, None)); let bits4: Vec = vec![1, 1000, 9999, BITS_PER_BLOCK + 1]; - let br4 = write(&bits4); + let br4 = bitrank(bits4); assert_eq!(br4.rank_select(10000), (3, Some(9999))); let bits5: Vec = vec![1, 1000, 9999, BITS_PER_BLOCK + 1]; - let br5 = write(&bits5); + let br5 = bitrank(bits5); assert_eq!(br5.rank_select(BITS_PER_BLOCK), (3, None)); } @@ -422,7 +313,7 @@ mod tests { // This isn't strictly necessary, given that the bit would just be toggled again, but it // ensures that we are meeting the contract. 
random_bits.dedup(); - let br = write(&random_bits); + let br = bitrank(random_bits.iter().copied()); let mut rank = 0; let mut select = None; for i in 0..random_bits.capacity() { @@ -442,7 +333,7 @@ mod tests { #[test] fn test_rank_out_of_bounds() { for i in 1..30 { - let br = write(&[BITS_PER_BLOCK * i - 1]); + let br = bitrank([BITS_PER_BLOCK * i - 1]); assert_eq!(br.max_rank(), 1); assert_eq!(br.rank(BITS_PER_BLOCK * i - 1), 0); for j in 0..10 { @@ -451,29 +342,9 @@ mod tests { } } - #[test] - fn test_predecessor_and_successor() { - let mut rng = ChaCha8Rng::seed_from_u64(2); - let uniform = Uniform::::from(0..1_000_000); - let mut random_bits = Vec::with_capacity(100_000); - for _ in 0..100_000 { - random_bits.push(uniform.sample(&mut rng)); - } - random_bits.sort_unstable(); - random_bits.dedup(); - let br = write(&random_bits); - - for (i, j) in random_bits.iter().copied().tuple_windows() { - for k in i..j { - assert_eq!(br.successor(k + 1), j, "{i} {k} {j}"); - assert_eq!(br.predecessor(k), i, "{i} {k} {j}"); - } - } - } - #[test] fn test_large_gap() { - let br = BitRank::from_iter((3..4).chain(BITS_PER_BLOCK * 15..BITS_PER_BLOCK * 15 + 17)); + let br = bitrank((3..4).chain(BITS_PER_BLOCK * 15..BITS_PER_BLOCK * 15 + 17)); for i in 1..15 { assert_eq!(br.rank(BITS_PER_BLOCK * i), 1); } From 3cecb0b23a8408ccc8300337d0539ba8070c8fd7 Mon Sep 17 00:00:00 2001 From: Jason Orendorff Date: Wed, 13 Nov 2024 11:32:41 -0600 Subject: [PATCH 10/11] address review comments --- crates/string-offsets/Cargo.toml | 4 ++-- crates/string-offsets/README.md | 6 +++--- crates/string-offsets/src/lib.rs | 12 +++++++----- 3 files changed, 12 insertions(+), 10 deletions(-) diff --git a/crates/string-offsets/Cargo.toml b/crates/string-offsets/Cargo.toml index dabf4d3..7d60dd2 100644 --- a/crates/string-offsets/Cargo.toml +++ b/crates/string-offsets/Cargo.toml @@ -3,10 +3,10 @@ name = "string-offsets" authors = ["The blackbird team "] version = "0.1.0" edition = "2021" -description = "Offset calculator to convert between byte, char, and line offsets in a string." +description = "Converts string offsets between UTF-8 bytes, UTF-16 code units, Unicode code points, and lines." repository = "https://github.com/github/rust-gems" license = "MIT" -keywords = ["unicode", "string", "offsets", "positions", "interoperability"] +keywords = ["unicode", "positions", "utf16", "characters", "lines"] categories = ["algorithms", "data-structures", "text-processing", "development-tools::ffi"] [dev-dependencies] diff --git a/crates/string-offsets/README.md b/crates/string-offsets/README.md index fd04fc6..7ad8c23 100644 --- a/crates/string-offsets/README.md +++ b/crates/string-offsets/README.md @@ -1,13 +1,13 @@ # string-offsets -Offset calculator to convert between byte, char, and line offsets in a string. +Converts string offsets between UTF-8 bytes, UTF-16 code units, Unicode code points, and lines. Rust strings are UTF-8, but JavaScript has UTF-16 strings, and in Python, strings are sequences of Unicode code points. It's therefore necessary to adjust string offsets when communicating across programming language boundaries. [`StringOffsets`] does these adjustments. -Each `StringOffsets` value contains offset information for a single string. [Building the data -structure](StringOffsets::new) takes O(n) time and memory, but then each conversion is fast. +Each `StringOffsets` instance contains offset information for a single string. 
[Building the data +structure](StringOffsets::new) takes O(n) time and memory, but then most conversions are O(1). ["UTF-8 Conversions with BitRank"](https://adaptivepatchwork.com/2023/07/10/utf-conversion/) is a blog post explaining the implementation. diff --git a/crates/string-offsets/src/lib.rs b/crates/string-offsets/src/lib.rs index a24d45c..ee05e54 100644 --- a/crates/string-offsets/src/lib.rs +++ b/crates/string-offsets/src/lib.rs @@ -1,4 +1,4 @@ -//! Offset calculator to convert between byte, char, and line offsets in a string. +//! Converts string offsets between UTF-8 bytes, UTF-16 code units, Unicode code points, and lines. //! //! # Example //! @@ -17,7 +17,7 @@ //! // ...but only 3 UTF-16 code units... //! assert_eq!(offsets.utf8_to_utf16(12), 8); //! assert_eq!(offsets.utf8_to_utf16(19), 11); -//! // ...and only 2 Unicode characters. +//! // ...and only 2 Unicode code points. //! assert_eq!(offsets.utf8s_to_chars(12..19), 8..10); //! ``` //! @@ -30,14 +30,16 @@ mod bitrank; use bitrank::{BitRank, BitRankBuilder}; -/// Offset calculator to convert between byte, char, and line offsets in a string. +/// Converts positions within a given string between UTF-8 byte offsets (the usual in Rust), UTF-16 +/// code units, Unicode code points, and line numbers. /// /// Rust strings are UTF-8, but JavaScript has UTF-16 strings, and in Python, strings are sequences /// of Unicode code points. It's therefore necessary to adjust string offsets when communicating /// across programming language boundaries. [`StringOffsets`] does these adjustments. /// -/// Each `StringOffsets` value contains offset information for a single string. [Building the -/// data structure](StringOffsets::new) takes O(n) time and memory, but then each conversion is fast. +/// Each `StringOffsets` instance contains offset information for a single string. [Building the +/// data structure](StringOffsets::new) takes O(n) time and memory, but then most conversions are +/// O(1). /// /// ["UTF-8 Conversions with BitRank"](https://adaptivepatchwork.com/2023/07/10/utf-conversion/) /// is a blog post explaining the implementation. From a2735a452e74dc3c0a7b4e3debaeebcbfa7962d1 Mon Sep 17 00:00:00 2001 From: Jason Orendorff Date: Wed, 13 Nov 2024 11:46:43 -0600 Subject: [PATCH 11/11] Remove obsolete dev-dependency on itertools --- crates/string-offsets/Cargo.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/crates/string-offsets/Cargo.toml b/crates/string-offsets/Cargo.toml index 7d60dd2..fd9b838 100644 --- a/crates/string-offsets/Cargo.toml +++ b/crates/string-offsets/Cargo.toml @@ -10,6 +10,5 @@ keywords = ["unicode", "positions", "utf16", "characters", "lines"] categories = ["algorithms", "data-structures", "text-processing", "development-tools::ffi"] [dev-dependencies] -itertools = "0.13" rand = "0.8" rand_chacha = "0.3"
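As a closing illustration of the cross-language use case the new crate description emphasizes, here is a minimal sketch, assuming the `chars_to_utf8s` and `utf8s_to_chars` methods introduced earlier in the series: a Python-style code-point range is mapped to a byte range for slicing on the Rust side, and back again.

```rust
use string_offsets::StringOffsets;

fn main() {
    // The leading map emoji is four UTF-8 bytes but a single code point, so
    // code-point offsets and byte offsets disagree for the rest of the line.
    let s = "\u{1f5fa} voyage: 42 km\n";
    let offsets = StringOffsets::new(s);

    // A Python-side tool reports the distance as the code-point range 10..12 ("42").
    let chars = 10..12;

    // Convert to UTF-8 byte offsets before slicing the Rust string...
    let bytes = offsets.chars_to_utf8s(chars.clone());
    assert_eq!(bytes, 13..15);
    assert_eq!(&s[bytes.clone()], "42");

    // ...and the round trip recovers the original code-point range.
    assert_eq!(offsets.utf8s_to_chars(bytes), chars);
}
```

Because the underlying tables are built once per string, repeating such lookups against the same `StringOffsets` value stays cheap, in line with the O(n) build / fast-conversion costs stated above.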