diff --git a/Cargo.toml b/Cargo.toml
index 41bca52..e2baf92 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -19,3 +19,6 @@ normalization = ["unicode-normalization"]
 [dependencies]
 unicode-segmentation = { version = "1.7.1", optional = true }
 unicode-normalization = { version = "0.1.17", optional = true }
+
+[dev-dependencies]
+rand = "0.8.0"
diff --git a/src/normalization.rs b/src/normalization.rs
index 4825498..a207462 100644
--- a/src/normalization.rs
+++ b/src/normalization.rs
@@ -5,20 +5,31 @@
 //! you might consider the [LowerCaseNormalizer].
 //!
 //! ```
-//! # use fuzzywuzzy::normalization::{Normalizer, LowerCaseNormalizer, FormCNormalization, MultipleNormalizer};
+//! # use fuzzywuzzy::normalization::{Normalizer, LowerCaseNormalizer, FormCNormalizer, ComposedNormalizer};
 //! assert_eq!(LowerCaseNormalizer.normalize("this STRING"), LowerCaseNormalizer.normalize("THIS string"));
 //! let a1 = "ä"; // U+00E4
 //! let a2 = "ä"; // U+0061 + U+0308
 //! let a3 = "Ä"; // U+0041 + U+0308
 //! assert_ne!(a1, a2);
-//! assert_eq!(FormCNormalization.normalize(a2), a1);
-//! let multiple_normalizers = MultipleNormalizer::with(vec![Box::new(LowerCaseNormalizer), Box::new(FormCNormalization)]);
+//! assert_eq!(FormCNormalizer.normalize(a2), a1);
+//! let multiple_normalizers = ComposedNormalizer::with(
+//!     vec![Box::new(LowerCaseNormalizer), Box::new(FormCNormalizer)]);
 //! assert_eq!(multiple_normalizers.normalize(a3), a1);
 //! ```

 /// Represents a strategy for normalizing string characters into a canonical value of their equivalence class.
 ///
 /// i.e., in a case-insensitive context, 'a' might be the canonical value for the equivalence class of ASCII A's: `['a', 'A']`.
+///
+/// In addition to implementers of the trait, functions with a matching type signature also work.
+/// ```
+/// # use fuzzywuzzy::normalization::{Normalizer, LowerCaseNormalizer};
+/// let test_string = "test STRING";
+/// fn custom_normalizer(s: &str) -> String { s.to_lowercase() }
+/// assert_eq!(
+///     LowerCaseNormalizer.normalize(&test_string),
+///     custom_normalizer.normalize(&test_string));
+/// ```
 pub trait Normalizer {
     fn normalize(&self, s: &str) -> String;
 }
@@ -30,6 +41,16 @@ impl<F: Fn(&str) -> String> Normalizer for F {
 }

 /// Doesn't modify any characters. The Identity-transform [Normalizer].
+///
+/// ```
+/// # use fuzzywuzzy::normalization::{Normalizer, PassthroughNormalizer};
+/// use rand::{thread_rng, Rng};
+/// use rand::distributions::Alphanumeric;
+/// let random_string: String = thread_rng()
+///     .sample_iter(&Alphanumeric)
+///     .take(16).map(char::from).collect();
+/// assert_eq!(PassthroughNormalizer.normalize(&random_string), random_string);
+/// ```
 pub struct PassthroughNormalizer;

 impl Normalizer for PassthroughNormalizer {
@@ -38,21 +59,28 @@
     }
 }

-// ew, need a better name
 /// Compose a sequence of [Normalizer]s together into one [Normalizer].
 ///
-/// They are executed in order.
-pub struct MultipleNormalizer {
+/// They are executed in sequential order.
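+/// For example, composing [LowerCaseNormalizer] with [FormCNormalizer] first lower-cases
+/// the string and then recomposes it into Normalization Form C, as the doctest below shows.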
+///
+/// ```
+/// # use fuzzywuzzy::normalization::{Normalizer, LowerCaseNormalizer, FormCNormalizer, ComposedNormalizer};
+/// let a1 = "ä"; // U+00E4
+/// let a2 = "Ä"; // U+0041 + U+0308
+/// let multiple_normalizers = ComposedNormalizer::with(
+///     vec![Box::new(LowerCaseNormalizer), Box::new(FormCNormalizer)]);
+/// assert_eq!(multiple_normalizers.normalize(a2), a1);
+/// ```
+pub struct ComposedNormalizer {
     normalizers: Vec<Box<dyn Normalizer>>,
 }

-impl MultipleNormalizer {
-    pub fn with(normalizers: Vec<Box<dyn Normalizer>>) -> MultipleNormalizer {
-        MultipleNormalizer { normalizers }
+impl ComposedNormalizer {
+    pub fn with(normalizers: Vec<Box<dyn Normalizer>>) -> ComposedNormalizer {
+        ComposedNormalizer { normalizers }
     }
 }

-impl Normalizer for MultipleNormalizer {
+impl Normalizer for ComposedNormalizer {
     fn normalize(&self, s: &str) -> String {
         let mut current = s.to_owned();
         for normalizer in self.normalizers.iter() {
@@ -63,6 +91,11 @@ impl Normalizer for MultipleNormalizer {
 }

 /// Normalizes strings by lower-casing all letters.
+///
+/// ```
+/// # use fuzzywuzzy::normalization::{Normalizer, LowerCaseNormalizer};
+/// assert_eq!(LowerCaseNormalizer.normalize("this STRING"), LowerCaseNormalizer.normalize("THIS string"));
+/// ```
 pub struct LowerCaseNormalizer;

 impl Normalizer for LowerCaseNormalizer {
@@ -72,9 +105,17 @@ impl Normalizer for LowerCaseNormalizer {
 }

 /// Removes non-ASCII codepoints.
-pub struct AsciiOnlyFilter;
-
-impl Normalizer for AsciiOnlyFilter {
+///
+/// Notably, this does not ASCII-ify non-ASCII characters, it just removes them.
+/// If you want to turn characters into ASCII decompositions, look at
+/// [FormDNormalizer], [FormKDNormalizer], or [UnicodeToAsciiNormalizer].
+/// ```
+/// # use fuzzywuzzy::normalization::{Normalizer, AsciiOnlyNormalizer};
+/// assert_eq!(AsciiOnlyNormalizer.normalize("äbc"), "bc");
+/// ```
+pub struct AsciiOnlyNormalizer;
+
+impl Normalizer for AsciiOnlyNormalizer {
     fn normalize(&self, s: &str) -> String {
         s.chars().filter(char::is_ascii).collect()
     }
@@ -88,74 +129,105 @@ mod unicode_normalizers {
     use super::Normalizer;
     use unicode_normalization::UnicodeNormalization;

-    /// Performs Unicode Normalization Form C (canonical decomposition followed by canonical composition).
+    /// Performs Unicode Normalization Form C (canonical decomposition followed by canonical composition). Requires default feature "normalization".
     ///
-    /// This just delegates to [unicode_normalization].
-    pub struct FormCNormalization;
+    /// This just delegates to [unicode_normalization::UnicodeNormalization::nfc].
+    ///
+    /// ```
+    /// # use fuzzywuzzy::normalization::{Normalizer, FormCNormalizer};
+    /// let a1 = "ä"; // U+00E4
+    /// let a2 = "ä"; // U+0061 + U+0308
+    /// assert_ne!(a1, a2);
+    /// assert_eq!(FormCNormalizer.normalize(a2), a1);
+    /// ```
+    pub struct FormCNormalizer;

-    impl Normalizer for FormCNormalization {
+    impl Normalizer for FormCNormalizer {
         fn normalize(&self, s: &str) -> String {
             s.nfc().collect()
         }
     }

-    /// Performs Unicode Normalization Form KC (compatibility decomposition followed by canonical composition).
+    /// Performs Unicode Normalization Form KC (compatibility decomposition followed by canonical composition). Requires default feature "normalization".
+    ///
+    /// This just delegates to [unicode_normalization::UnicodeNormalization::nfkc].
     ///
-    /// This just delegates to [unicode_normalization].
-    pub struct FormKCNormalization;
+    /// ```
+    /// # use fuzzywuzzy::normalization::{Normalizer, FormKCNormalizer};
+    /// let a1 = "ä"; // U+00E4
+    /// let a2 = "ä"; // U+0061 + U+0308
+    /// assert_ne!(a1, a2);
+    /// assert_eq!(FormKCNormalizer.normalize(a2), a1);
+    /// ```
+    pub struct FormKCNormalizer;

-    impl Normalizer for FormKCNormalization {
+    impl Normalizer for FormKCNormalizer {
         fn normalize(&self, s: &str) -> String {
             s.nfkc().collect()
         }
     }

-    /// Performs Unicode Normalization Form D (canonical decomposition).
+    /// Performs Unicode Normalization Form D (canonical decomposition). Requires default feature "normalization".
+    ///
+    /// This just delegates to [unicode_normalization::UnicodeNormalization::nfd].
     ///
-    /// This just delegates to [unicode_normalization].
     /// ```
-    /// # use fuzzywuzzy::normalization::{Normalizer, FormDNormalization};
-    /// // FormDNormalization.normalize(U+00E4) == (U+0061 + U+0308)
-    /// assert_eq!(FormDNormalization.normalize("ä"), "a\u{0308}");
+    /// # use fuzzywuzzy::normalization::{Normalizer, FormDNormalizer};
+    /// // FormDNormalizer.normalize(U+00E4) == (U+0061 + U+0308)
+    /// assert_eq!(FormDNormalizer.normalize("ä"), "a\u{0308}");
     /// ```
-    pub struct FormDNormalization;
-    impl Normalizer for FormDNormalization {
+    pub struct FormDNormalizer;
+    impl Normalizer for FormDNormalizer {
         fn normalize(&self, s: &str) -> String {
             s.nfd().collect()
         }
     }

-    /// Performs Unicode Normalization Form KD (compatibility decomposition).
+    /// Performs Unicode Normalization Form KD (compatibility decomposition). Requires default feature "normalization".
+    ///
+    /// This just delegates to [unicode_normalization::UnicodeNormalization::nfkd].
     ///
-    /// This just delegates to [unicode_normalization].
-    pub struct FormKDNormalization;
-    impl Normalizer for FormKDNormalization {
+    /// ```
+    /// # use fuzzywuzzy::normalization::{Normalizer, FormKDNormalizer};
+    /// // FormKDNormalizer.normalize(U+00E4) == (U+0061 + U+0308)
+    /// assert_eq!(FormKDNormalizer.normalize("ä"), "a\u{0308}");
+    /// ```
+    pub struct FormKDNormalizer;
+    impl Normalizer for FormKDNormalizer {
         fn normalize(&self, s: &str) -> String {
             s.nfkd().collect()
         }
     }

-    /// Performs CJK Compatibility Ideograph-to-Standarized Variation Sequence normalization.
+    /// Performs CJK Compatibility Ideograph-to-Standardized Variation Sequence normalization. Requires default feature "normalization".
     ///
-    /// This just delegates to [unicode_normalization]. "This is not part of the canonical or compatibility decomposition algorithms, but performing it before those algorithms produces normalized output which better preserves the intent of the original text."
-    pub struct CJKNormalization;
-    impl Normalizer for CJKNormalization {
+    /// This just delegates to [unicode_normalization::UnicodeNormalization::cjk_compat_variants].
+    ///
+    /// > "This is not part of the canonical or compatibility decomposition algorithms, but performing it before those algorithms produces normalized output which better preserves the intent of the original text." -- [unicode_normalization](unicode_normalization::UnicodeNormalization::cjk_compat_variants)
+    pub struct CJKNormalizer;
+    impl Normalizer for CJKNormalizer {
         fn normalize(&self, s: &str) -> String {
             s.cjk_compat_variants().collect()
         }
     }

-    /// Decomposes a string, then removes non-ascii code points.
+    /// Decomposes a string, then removes non-ASCII code points. Requires default feature "normalization".
+    ///
+    /// Caution is needed when applying this [Normalizer].
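+    /// It keeps only the ASCII code points that remain after canonical decomposition, silently dropping everything else.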
+    /// While it may improve comparisons for Latin-script languages, whose characters often decompose largely into ASCII plus diacritics,
+    /// it will perform poorly on less ASCII-centric scripts.
     ///
     /// ```
-    /// # use fuzzywuzzy::normalization::{Normalizer, UnicodeToAsciiNormalization};
+    /// # use fuzzywuzzy::normalization::{Normalizer, UnicodeToAsciiNormalizer};
     /// // U+00E4
-    /// assert_eq!(UnicodeToAsciiNormalization.normalize("ä"), "a");
+    /// assert_eq!(UnicodeToAsciiNormalizer.normalize("äbc"), "abc");
     /// // U+0061 + U+0308
-    /// assert_eq!(UnicodeToAsciiNormalization.normalize("a\u{0308}"), "a");
+    /// assert_eq!(UnicodeToAsciiNormalizer.normalize("a\u{0308}bc"), "abc");
+    /// // This is probably not what you want!
+    /// assert_eq!(UnicodeToAsciiNormalizer.normalize("किमप"), "");
     /// ```
-    pub struct UnicodeToAsciiNormalization;
-    impl Normalizer for UnicodeToAsciiNormalization {
+    pub struct UnicodeToAsciiNormalizer;
+    impl Normalizer for UnicodeToAsciiNormalizer {
         fn normalize(&self, s: &str) -> String {
             s.nfd().filter(char::is_ascii).collect()
         }
diff --git a/src/primitives.rs b/src/primitives.rs
index 1ab66b7..b5e255c 100644
--- a/src/primitives.rs
+++ b/src/primitives.rs
@@ -37,7 +37,13 @@ pub fn get_matching_blocks<T: Eq>(a: &[T], b: &[T]) -> Vec<(usize, usize, usize)
     let mut queue = vec![(0, len1, 0, len2)];
     let mut matching_blocks = Vec::new();
     while let Some((low1, high1, low2, high2)) = queue.pop() {
-        let (i, j, k) = find_longest_match(shorter, longer, low1, high1, low2, high2);
+        // TODO: I'd like to convert this function to use MatchingStreaks internally.
+        // It might be clearer to compare low1 < streak.idx1 instead of low1 < i.
+        let MatchingStreak {
+            idx1: i,
+            idx2: j,
+            size: k,
+        } = find_longest_match(shorter, longer, low1, high1, low2, high2);
         debug_assert!(i <= shorter.len());
         debug_assert!(j <= longer.len());
         if k != 0 {
@@ -53,10 +59,13 @@ pub fn get_matching_blocks<T: Eq>(a: &[T], b: &[T]) -> Vec<(usize, usize, usize)
     matching_blocks.sort_unstable();
     let (mut i1, mut j1, mut k1) = (0, 0, 0);
     let mut non_adjacent = Vec::new();
+    // collapse adjacent blocks
     for (i2, j2, k2) in matching_blocks {
         if i1 + k1 == i2 && j1 + k1 == j2 {
+            // blocks are adjacent, combine
             k1 += k2;
         } else {
+            // not adjacent, push if it isn't the first dummy block.
             if k1 != 0 {
                 non_adjacent.push((i1, j1, k1));
             }
@@ -75,7 +84,39 @@ pub fn get_matching_blocks<T: Eq>(a: &[T], b: &[T]) -> Vec<(usize, usize, usize)
         .collect()
 }

-/// TODO: doc + tests
+/// Represents a matching streak of characters between two strings.
+///
+/// See [find_longest_match] for details.
+#[derive(PartialEq, Eq, Copy, Clone, Debug)]
+pub struct MatchingStreak {
+    /// The index into the first (typically shorter) string where the streak begins.
+    pub idx1: usize,
+    /// The index into the second (typically longer) string where the streak begins.
+    pub idx2: usize,
+    /// The size of the matching character streak.
+    pub size: usize,
+}
+
+/// Finds the longest matching streak of characters of `shorter[low1..high1]` in `longer[low2..high2]`.
+///
+/// Returned as a [MatchingStreak] where
+/// `idx1` is an index into `shorter` where the streak begins,
+/// `idx2` is an index into `longer` where the streak begins,
+/// and `size` is the length of the streak.
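+/// Of all maximal matching streaks, the returned one starts earliest in `shorter`,
+/// mirroring the `difflib` algorithm linked in the implementation. When nothing
+/// matches at all, the zero-`size` fallback at the end of the function yields an
+/// empty streak anchored at `low1`/`low2`; a minimal sketch of that edge case:
+/// ```
+/// # use fuzzywuzzy::segmentation::{Segmenter, CodePointSegmenter};
+/// # use fuzzywuzzy::primitives::{find_longest_match, MatchingStreak};
+/// let a = CodePointSegmenter.segment("abc");
+/// let b = CodePointSegmenter.segment("xyz");
+/// assert_eq!(find_longest_match(&a, &b, 0, a.len(), 0, b.len()),
+///            MatchingStreak { idx1: 0, idx2: 0, size: 0 });
+/// ```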
+///
+/// ```
+/// # use fuzzywuzzy::segmentation::{Segmenter, CodePointSegmenter};
+/// # use fuzzywuzzy::primitives::{find_longest_match, MatchingStreak};
+/// let a = CodePointSegmenter.segment("foo bar");
+/// let b = CodePointSegmenter.segment("foo bar baz");
+/// let c = CodePointSegmenter.segment("bar baz");
+/// assert_eq!(find_longest_match(&a, &b, 0, a.len(), 0, b.len()),
+///            MatchingStreak { idx1: 0, idx2: 0, size: 7 });
+/// assert_eq!(find_longest_match(&a, &c, 0, a.len(), 0, c.len()),
+///            MatchingStreak { idx1: 3, idx2: 3, size: 3 });
+/// assert_eq!(find_longest_match(&c, &b, 0, c.len(), 0, b.len()),
+///            MatchingStreak { idx1: 0, idx2: 4, size: 7 });
+/// ```
 pub fn find_longest_match<T: Eq>(
     shorter: &[T],
     longer: &[T],
@@ -83,7 +124,7 @@
     high1: usize,
     low2: usize,
     high2: usize,
-) -> (usize, usize, usize) {
+) -> MatchingStreak {
     // https://github.com/python-git/python/blob/master/Lib/difflib.py#L351
     // algo:
     // In other words, of all maximal matching blocks, return one that
@@ -100,6 +141,7 @@
     debug_assert!(low2 <= high2);
     debug_assert!(high1 <= shorter.len());
     debug_assert!(high2 <= longer.len());
+    debug_assert!(high1 - low1 <= high2 - low2);
     let longsub = &longer[low2..high2];
     let len = high1 - low1;
     for size in (1..len + 1).rev() {
@@ -108,10 +150,18 @@
             for window_start in 0..((high2 - low2) - size + 1) {
                 let window = &longsub[window_start..window_start + size];
                 if window == shortsub {
-                    return (low1 + start, low2 + window_start, size);
+                    return MatchingStreak {
+                        idx1: low1 + start,
+                        idx2: low2 + window_start,
+                        size,
+                    };
                 }
             }
         }
     }
-    (low1, low2, 0)
+    MatchingStreak {
+        idx1: low1,
+        idx2: low2,
+        size: 0,
+    }
 }
diff --git a/src/segmentation.rs b/src/segmentation.rs
index 072a980..f0bbbf4 100644
--- a/src/segmentation.rs
+++ b/src/segmentation.rs
@@ -1,7 +1,7 @@
 //! Segmenter trait and default implementations.
 //!
 //! Segmentation is how strings are split into tokens for comparison.
-//! For example, two strings that *appear* identical might have different byte-level representations.
+//! For example, two strings that *visually appear* identical might have different byte-level representations.
 //!
 //! Take `ä` and `ä`. Visually, these should be identical. However, the former
 //! is Unicode character [ä (U+00E4)](https://www.compart.com/en/unicode/U+00E4)
@@ -39,6 +39,17 @@
 /// Represents a strategy for segmenting a string into units for comparison.
 ///
 /// The trait is also implemented for functions matching the signature of the `segment` method.
+///
+/// For example, a plain function can be used in place of a dedicated implementation:
+///
+/// ```
+/// # use fuzzywuzzy::segmentation::{Segmenter, CodePointSegmenter};
+/// let test_string = "test STRING";
+/// fn custom_segmenter(s: &str) -> Vec<char> { s.chars().collect() }
+/// assert_eq!(
+///     CodePointSegmenter.segment(&test_string),
+///     custom_segmenter.segment(&test_string));
+/// ```
 pub trait Segmenter<'a> {
     /// The type of the unit of comparison this strategy operates on.
     type Output: 'a + Eq;
@@ -54,6 +65,14 @@ impl<'a, F: Fn(&str) -> Vec<T>, T: 'a + Eq> Segmenter<'a> for F {
 }

 /// A strategy for segmenting strings into their constituent bytes.
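+/// Because all Rust strings are UTF-8, a single non-ASCII character may segment into
+/// several bytes, as the doctest below shows.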
+///
+/// ```
+/// # use fuzzywuzzy::segmentation::{Segmenter, ByteSegmenter};
+/// // U+00E4
+/// assert_eq!(ByteSegmenter.segment("ä"), vec![0xc3u8, 0xa4u8]);
+/// // U+0061 + U+0308
+/// assert_eq!(ByteSegmenter.segment("ä"), vec![0x61u8, 0xccu8, 0x88u8]);
+/// ```
 pub struct ByteSegmenter;

 impl<'a> Segmenter<'a> for ByteSegmenter {
@@ -71,6 +90,17 @@
 /// Note that `char` is a Unicode Scalar Value which is a subset of Unicode code points disallowing surrogates.
 /// UTF-8, which all Rust strings are guaranteed to be, also disallows surrogates.
 /// So all of the Unicode Scalar Values produced here are UTF-8 code points.
+///
+/// ```
+/// # use fuzzywuzzy::segmentation::{Segmenter, CodePointSegmenter};
+/// // U+00E4
+/// assert_eq!(CodePointSegmenter.segment("ä"), vec!['ä']);
+/// // U+0061 + U+0308
+/// assert_eq!(CodePointSegmenter.segment("ä"), vec!['a', '\u{0308}']);
+/// // 'किमपि' (kimapi) and 'किमप' (kimapa)
+/// assert_eq!(CodePointSegmenter.segment("किमपि"), vec!['क', 'ि', 'म', 'प', 'ि']);
+/// assert_eq!(CodePointSegmenter.segment("किमप"), vec!['क', 'ि', 'म', 'प']);
+/// ```
 pub struct CodePointSegmenter;

 impl<'a> Segmenter<'a> for CodePointSegmenter {
@@ -91,6 +121,17 @@ mod unicode_segmenters {
     /// A strategy for segmenting strings into their constituent Unicode graphemes. Requires default feature "segmentation".
     ///
     /// This just delegates to [unicode_segmentation].
+    ///
+    /// ```
+    /// # use fuzzywuzzy::segmentation::{Segmenter, GraphemeSegmenter};
+    /// // U+00E4
+    /// assert_eq!(GraphemeSegmenter.segment("ä"), vec!["ä"]);
+    /// // U+0061 + U+0308
+    /// assert_eq!(GraphemeSegmenter.segment("ä"), vec!["ä"]);
+    /// // 'किमपि' (kimapi) and 'किमप' (kimapa)
+    /// assert_eq!(GraphemeSegmenter.segment("किमपि"), vec!["कि", "म", "पि"]);
+    /// assert_eq!(GraphemeSegmenter.segment("किमप"), vec!["कि", "म", "प"]);
+    /// ```
     pub struct GraphemeSegmenter;

     impl<'a> Segmenter<'a> for GraphemeSegmenter {