diff --git a/crates/string-offsets/Cargo.toml b/crates/string-offsets/Cargo.toml index dabf4d3..7d60dd2 100644 --- a/crates/string-offsets/Cargo.toml +++ b/crates/string-offsets/Cargo.toml @@ -3,10 +3,10 @@ name = "string-offsets" authors = ["The blackbird team "] version = "0.1.0" edition = "2021" -description = "Offset calculator to convert between byte, char, and line offsets in a string." +description = "Converts string offsets between UTF-8 bytes, UTF-16 code units, Unicode code points, and lines." repository = "https://github.com/github/rust-gems" license = "MIT" -keywords = ["unicode", "string", "offsets", "positions", "interoperability"] +keywords = ["unicode", "positions", "utf16", "characters", "lines"] categories = ["algorithms", "data-structures", "text-processing", "development-tools::ffi"] [dev-dependencies] diff --git a/crates/string-offsets/README.md b/crates/string-offsets/README.md index fd04fc6..7ad8c23 100644 --- a/crates/string-offsets/README.md +++ b/crates/string-offsets/README.md @@ -1,13 +1,13 @@ # string-offsets -Offset calculator to convert between byte, char, and line offsets in a string. +Converts string offsets between UTF-8 bytes, UTF-16 code units, Unicode code points, and lines. Rust strings are UTF-8, but JavaScript has UTF-16 strings, and in Python, strings are sequences of Unicode code points. It's therefore necessary to adjust string offsets when communicating across programming language boundaries. [`StringOffsets`] does these adjustments. -Each `StringOffsets` value contains offset information for a single string. [Building the data -structure](StringOffsets::new) takes O(n) time and memory, but then each conversion is fast. +Each `StringOffsets` instance contains offset information for a single string. [Building the data +structure](StringOffsets::new) takes O(n) time and memory, but then most conversions are O(1). 
["UTF-8 Conversions with BitRank"](https://adaptivepatchwork.com/2023/07/10/utf-conversion/) is a blog post explaining the implementation. diff --git a/crates/string-offsets/src/lib.rs b/crates/string-offsets/src/lib.rs index a24d45c..ee05e54 100644 --- a/crates/string-offsets/src/lib.rs +++ b/crates/string-offsets/src/lib.rs @@ -1,4 +1,4 @@ -//! Offset calculator to convert between byte, char, and line offsets in a string. +//! Converts string offsets between UTF-8 bytes, UTF-16 code units, Unicode code points, and lines. //! //! # Example //! @@ -17,7 +17,7 @@ //! // ...but only 3 UTF-16 code units... //! assert_eq!(offsets.utf8_to_utf16(12), 8); //! assert_eq!(offsets.utf8_to_utf16(19), 11); -//! // ...and only 2 Unicode characters. +//! // ...and only 2 Unicode code points. //! assert_eq!(offsets.utf8s_to_chars(12..19), 8..10); //! ``` //! @@ -30,14 +30,16 @@ mod bitrank; use bitrank::{BitRank, BitRankBuilder}; -/// Offset calculator to convert between byte, char, and line offsets in a string. +/// Converts positions within a given string between UTF-8 byte offsets (the norm in Rust), UTF-16 +/// code units, Unicode code points, and line numbers. /// /// Rust strings are UTF-8, but JavaScript has UTF-16 strings, and in Python, strings are sequences /// of Unicode code points. It's therefore necessary to adjust string offsets when communicating /// across programming language boundaries. [`StringOffsets`] does these adjustments. /// -/// Each `StringOffsets` value contains offset information for a single string. [Building the -/// data structure](StringOffsets::new) takes O(n) time and memory, but then each conversion is fast. +/// Each `StringOffsets` instance contains offset information for a single string. [Building the +/// data structure](StringOffsets::new) takes O(n) time and memory, but then most conversions are +/// O(1). 
/// /// ["UTF-8 Conversions with BitRank"](https://adaptivepatchwork.com/2023/07/10/utf-conversion/) /// is a blog post explaining the implementation.