add more documentation examples, some renames per PR discussion
logannc committed Feb 20, 2021
1 parent 89b6ee6 commit cf2765d
Showing 4 changed files with 214 additions and 48 deletions.
3 changes: 3 additions & 0 deletions Cargo.toml
@@ -19,3 +19,6 @@ normalization = ["unicode-normalization"]
[dependencies]
unicode-segmentation = { version = "1.7.1", optional = true }
unicode-normalization = { version = "0.1.17", optional = true }

[dev-dependencies]
rand = "0.8.0"
156 changes: 114 additions & 42 deletions src/normalization.rs
@@ -5,20 +5,31 @@
//! you might consider the [LowerCaseNormalizer].
//!
//! ```
//! # use fuzzywuzzy::normalization::{Normalizer, LowerCaseNormalizer, FormCNormalization, MultipleNormalizer};
//! # use fuzzywuzzy::normalization::{Normalizer, LowerCaseNormalizer, FormCNormalizer, ComposedNormalizer};
//! assert_eq!(LowerCaseNormalizer.normalize("this STRING"), LowerCaseNormalizer.normalize("THIS string"));
//! let a1 = "ä"; // U+00E4
//! let a2 = "ä"; // U+0061 + U+0308
//! let a3 = "Ä"; // U+0041 + U+0308
//! assert_ne!(a1, a2);
//! assert_eq!(FormCNormalization.normalize(a2), a1);
//! let multiple_normalizers = MultipleNormalizer::with(vec![Box::new(LowerCaseNormalizer), Box::new(FormCNormalization)]);
//! assert_eq!(FormCNormalizer.normalize(a2), a1);
//! let multiple_normalizers = ComposedNormalizer::with(
//! vec![Box::new(LowerCaseNormalizer), Box::new(FormCNormalizer)]);
//! assert_eq!(multiple_normalizers.normalize(a3), a1);
//! ```
/// Represents a strategy for normalizing string characters into a canonical value of their equivalence class.
///
/// E.g., in a case-insensitive context, 'a' might be the canonical value for the equivalence class of ASCII A's: `['a', 'A']`.
///
/// In addition to implementers of the trait, functions with a matching type signature also work.
/// ```
/// # use fuzzywuzzy::normalization::{Normalizer, LowerCaseNormalizer};
/// let test_string = "test STRING";
/// fn custom_normalizer(s: &str) -> String { s.to_lowercase() }
/// assert_eq!(
/// LowerCaseNormalizer.normalize(&test_string),
/// custom_normalizer.normalize(&test_string));
/// ```
pub trait Normalizer {
fn normalize(&self, s: &str) -> String;
}
@@ -30,6 +41,16 @@ impl<F: Fn(&str) -> String> Normalizer for F {
}

/// Doesn't modify any characters. The Identity-transform [Normalizer].
///
/// ```
/// # use fuzzywuzzy::normalization::{Normalizer, PassthroughNormalizer};
/// use rand::{thread_rng, Rng};
/// use rand::distributions::Alphanumeric;
/// let random_string: String = thread_rng()
/// .sample_iter(&Alphanumeric)
/// .take(16).map(char::from).collect();
/// assert_eq!(PassthroughNormalizer.normalize(&random_string), random_string);
/// ```
pub struct PassthroughNormalizer;

impl Normalizer for PassthroughNormalizer {
@@ -38,21 +59,28 @@ impl Normalizer for PassthroughNormalizer {
}
}

// ew, need a better name
/// Compose a sequence of [Normalizer]s together into one [Normalizer].
///
/// They are executed in order.
pub struct MultipleNormalizer {
/// They are executed in sequential order.
/// ```
/// # use fuzzywuzzy::normalization::{Normalizer, LowerCaseNormalizer, FormCNormalizer, ComposedNormalizer};
/// let a1 = "ä"; // U+00E4
/// let a2 = "Ä"; // U+0041 + U+0308
/// let multiple_normalizers = ComposedNormalizer::with(
/// vec![Box::new(LowerCaseNormalizer), Box::new(FormCNormalizer)]);
/// assert_eq!(multiple_normalizers.normalize(a2), a1);
/// ```
pub struct ComposedNormalizer {
normalizers: Vec<Box<dyn Normalizer>>,
}

impl MultipleNormalizer {
pub fn with(normalizers: Vec<Box<dyn Normalizer>>) -> MultipleNormalizer {
MultipleNormalizer { normalizers }
impl ComposedNormalizer {
pub fn with(normalizers: Vec<Box<dyn Normalizer>>) -> ComposedNormalizer {
ComposedNormalizer { normalizers }
}
}

impl Normalizer for MultipleNormalizer {
impl Normalizer for ComposedNormalizer {
fn normalize(&self, s: &str) -> String {
let mut current = s.to_owned();
for normalizer in self.normalizers.iter() {
@@ -63,6 +91,11 @@ impl Normalizer for MultipleNormalizer {
}

/// Normalizes strings by lower-casing all letters.
///
/// ```
/// # use fuzzywuzzy::normalization::{Normalizer, LowerCaseNormalizer};
/// assert_eq!(LowerCaseNormalizer.normalize("this STRING"), LowerCaseNormalizer.normalize("THIS string"));
/// ```
pub struct LowerCaseNormalizer;

impl Normalizer for LowerCaseNormalizer {
@@ -72,9 +105,17 @@ impl Normalizer for LowerCaseNormalizer {
}

/// Removes non-ASCII codepoints.
pub struct AsciiOnlyFilter;

impl Normalizer for AsciiOnlyFilter {
///
/// Notably, this does not ASCII-ify non-ASCII characters; it just removes them.
/// If you want to turn characters into ASCII decompositions, look at
/// [FormDNormalizer], [FormKDNormalizer], or [UnicodeToAsciiNormalizer].
/// ```
/// # use fuzzywuzzy::normalization::{Normalizer, AsciiOnlyNormalizer};
/// assert_eq!(AsciiOnlyNormalizer.normalize("äbc"), "bc");
/// ```
pub struct AsciiOnlyNormalizer;

impl Normalizer for AsciiOnlyNormalizer {
fn normalize(&self, s: &str) -> String {
s.chars().filter(char::is_ascii).collect()
}
@@ -88,74 +129,105 @@ mod unicode_normalizers {
use super::Normalizer;
use unicode_normalization::UnicodeNormalization;

/// Performs Unicode Normalization Form C (canonical decomposition followed by canonical composition).
/// Performs Unicode Normalization Form C (canonical decomposition followed by canonical composition). Requires default feature "normalization".
///
/// This just delegates to [unicode_normalization].
pub struct FormCNormalization;
/// This just delegates to [unicode_normalization::UnicodeNormalization::nfc].
///
/// ```
/// # use fuzzywuzzy::normalization::{Normalizer, FormCNormalizer};
/// let a1 = "ä"; // U+00E4
/// let a2 = "ä"; // U+0061 + U+0308
/// assert_ne!(a1, a2);
/// assert_eq!(FormCNormalizer.normalize(a2), a1);
/// ```
pub struct FormCNormalizer;

impl Normalizer for FormCNormalization {
impl Normalizer for FormCNormalizer {
fn normalize(&self, s: &str) -> String {
s.nfc().collect()
}
}

/// Performs Unicode Normalization Form KC (compatibility decomposition followed by canonical composition).
/// Performs Unicode Normalization Form KC (compatibility decomposition followed by canonical composition). Requires default feature "normalization".
///
/// This just delegates to [unicode_normalization::UnicodeNormalization::nfkc].
///
/// This just delegates to [unicode_normalization].
pub struct FormKCNormalization;
/// ```
/// # use fuzzywuzzy::normalization::{Normalizer, FormKCNormalizer};
/// let a1 = "ä"; // U+00E4
/// let a2 = "ä"; // U+0061 + U+0308
/// assert_ne!(a1, a2);
/// assert_eq!(FormKCNormalizer.normalize(a2), a1);
/// ```
pub struct FormKCNormalizer;

impl Normalizer for FormKCNormalization {
impl Normalizer for FormKCNormalizer {
fn normalize(&self, s: &str) -> String {
s.nfkc().collect()
}
}

/// Performs Unicode Normalization Form D (canonical decomposition).
/// Performs Unicode Normalization Form D (canonical decomposition). Requires default feature "normalization".
///
/// This just delegates to [unicode_normalization::UnicodeNormalization::nfd].
///
/// This just delegates to [unicode_normalization].
/// ```
/// # use fuzzywuzzy::normalization::{Normalizer, FormDNormalization};
/// // FormDNormalization.normalize(U+00E4) == (U+0061 + U+0308)
/// assert_eq!(FormDNormalization.normalize("ä"), "a\u{0308}");
/// # use fuzzywuzzy::normalization::{Normalizer, FormDNormalizer};
/// // FormDNormalizer.normalize(U+00E4) == (U+0061 + U+0308)
/// assert_eq!(FormDNormalizer.normalize("ä"), "a\u{0308}");
/// ```
pub struct FormDNormalization;
impl Normalizer for FormDNormalization {
pub struct FormDNormalizer;
impl Normalizer for FormDNormalizer {
fn normalize(&self, s: &str) -> String {
s.nfd().collect()
}
}

/// Performs Unicode Normalization Form KD (compatibility decomposition).
/// Performs Unicode Normalization Form KD (compatibility decomposition). Requires default feature "normalization".
///
/// This just delegates to [unicode_normalization::UnicodeNormalization::nfkd].
///
/// This just delegates to [unicode_normalization].
pub struct FormKDNormalization;
impl Normalizer for FormKDNormalization {
/// ```
/// # use fuzzywuzzy::normalization::{Normalizer, FormKDNormalizer};
/// // FormKDNormalizer.normalize(U+00E4) == (U+0061 + U+0308)
/// assert_eq!(FormKDNormalizer.normalize("ä"), "a\u{0308}");
/// ```
pub struct FormKDNormalizer;
impl Normalizer for FormKDNormalizer {
fn normalize(&self, s: &str) -> String {
s.nfkd().collect()
}
}

/// Performs CJK Compatibility Ideograph-to-Standardized Variation Sequence normalization.
/// Performs CJK Compatibility Ideograph-to-Standardized Variation Sequence normalization. Requires default feature "normalization".
///
/// This just delegates to [unicode_normalization]. "This is not part of the canonical or compatibility decomposition algorithms, but performing it before those algorithms produces normalized output which better preserves the intent of the original text."
pub struct CJKNormalization;
impl Normalizer for CJKNormalization {
/// This just delegates to [unicode_normalization::UnicodeNormalization::cjk_compat_variants].
///
/// > "This is not part of the canonical or compatibility decomposition algorithms, but performing it before those algorithms produces normalized output which better preserves the intent of the original text." -- [unicode_normalization](unicode_normalization::UnicodeNormalization::cjk_compat_variants)
pub struct CJKNormalizer;
impl Normalizer for CJKNormalizer {
fn normalize(&self, s: &str) -> String {
s.cjk_compat_variants().collect()
}
}

/// Decomposes a string, then removes non-ASCII code points.
/// Decomposes a string, then removes non-ASCII code points. Requires default feature "normalization".
///
/// Caution is needed when applying this [Normalizer].
/// While it may improve comparisons for Latin-script languages, whose text often decomposes largely into ASCII plus combining diacritics,
/// it will perform poorly on less ASCII-centric languages.
///
/// ```
/// # use fuzzywuzzy::normalization::{Normalizer, UnicodeToAsciiNormalization};
/// # use fuzzywuzzy::normalization::{Normalizer, UnicodeToAsciiNormalizer};
/// // U+00E4
/// assert_eq!(UnicodeToAsciiNormalization.normalize("ä"), "a");
/// assert_eq!(UnicodeToAsciiNormalizer.normalize("äbc"), "abc");
/// // U+0061 + U+0308
/// assert_eq!(UnicodeToAsciiNormalization.normalize("a\u{0308}"), "a");
/// assert_eq!(UnicodeToAsciiNormalizer.normalize("a\u{0308}bc"), "abc");
/// // This is probably not what you want!
/// assert_eq!(UnicodeToAsciiNormalizer.normalize("किमप"), "");
/// ```
pub struct UnicodeToAsciiNormalization;
impl Normalizer for UnicodeToAsciiNormalization {
pub struct UnicodeToAsciiNormalizer;
impl Normalizer for UnicodeToAsciiNormalizer {
fn normalize(&self, s: &str) -> String {
s.nfd().filter(char::is_ascii).collect()
}
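Of the normalizers documented above, CJKNormalizer is the only one whose new docs lack a runnable example. A minimal usage sketch, not part of the diff, assuming the "normalization" feature is enabled and the post-rename names from this commit; the ASCII input is chosen only to show that characters without compatibility variants pass through unchanged:

use fuzzywuzzy::normalization::{CJKNormalizer, ComposedNormalizer, FormCNormalizer, Normalizer};

fn main() {
    // Map CJK compatibility ideographs to standardized variation sequences
    // *before* composing, per the unicode_normalization guidance quoted above.
    let pipeline = ComposedNormalizer::with(vec![
        Box::new(CJKNormalizer),
        Box::new(FormCNormalizer),
    ]);
    // Characters with no CJK compatibility variant pass through unchanged.
    assert_eq!(pipeline.normalize("abc"), "abc");
}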
60 changes: 55 additions & 5 deletions src/primitives.rs
@@ -37,7 +37,13 @@ pub fn get_matching_blocks<T: Eq>(a: &[T], b: &[T]) -> Vec<(usize, usize, usize)>
let mut queue = vec![(0, len1, 0, len2)];
let mut matching_blocks = Vec::new();
while let Some((low1, high1, low2, high2)) = queue.pop() {
let (i, j, k) = find_longest_match(shorter, longer, low1, high1, low2, high2);
// TODO: I'd like to convert this function to use MatchingStreaks internally.
// It might make it clearer to compare low1 < streak.idx1 instead of low1 < i.
let MatchingStreak {
idx1: i,
idx2: j,
size: k,
} = find_longest_match(shorter, longer, low1, high1, low2, high2);
debug_assert!(i <= shorter.len());
debug_assert!(j <= longer.len());
if k != 0 {
@@ -53,10 +59,13 @@ pub fn get_matching_blocks<T: Eq>(a: &[T], b: &[T]) -> Vec<(usize, usize, usize)>
matching_blocks.sort_unstable();
let (mut i1, mut j1, mut k1) = (0, 0, 0);
let mut non_adjacent = Vec::new();
// collapse adjacent blocks
for (i2, j2, k2) in matching_blocks {
if i1 + k1 == i2 && j1 + k1 == j2 {
// blocks are adjacent, combine
k1 += k2;
} else {
// not adjacent, push if it isn't the first dummy block.
if k1 != 0 {
non_adjacent.push((i1, j1, k1));
}
@@ -75,15 +84,47 @@ pub fn get_matching_blocks<T: Eq>(a: &[T], b: &[T]) -> Vec<(usize, usize, usize)>
.collect()
}
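The adjacency-collapsing loop above can be exercised standalone. A sketch with hypothetical block data, not part of the diff, mirroring the dummy-block trick where the initial (0, 0, 0) seeds the fold:

fn main() {
    // Sorted (i, j, k) blocks: (0, 0, 2) and (2, 2, 3) touch end-to-start in
    // both strings and merge into (0, 0, 5); (7, 8, 1) stays separate.
    let matching_blocks = vec![(0usize, 0usize, 2usize), (2, 2, 3), (7, 8, 1)];
    let (mut i1, mut j1, mut k1) = (0, 0, 0);
    let mut non_adjacent = Vec::new();
    for (i2, j2, k2) in matching_blocks {
        if i1 + k1 == i2 && j1 + k1 == j2 {
            // Adjacent in both strings: extend the current block.
            k1 += k2;
        } else {
            // Emit the accumulated block unless it is the initial dummy,
            // then start a new one.
            if k1 != 0 {
                non_adjacent.push((i1, j1, k1));
            }
            i1 = i2;
            j1 = j2;
            k1 = k2;
        }
    }
    if k1 != 0 {
        non_adjacent.push((i1, j1, k1));
    }
    assert_eq!(non_adjacent, vec![(0, 0, 5), (7, 8, 1)]);
}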

/// TODO: doc + tests
/// Represents a matching streak of characters between two strings.
///
/// See [find_longest_match] for details.
#[derive(PartialEq, Eq, Copy, Clone, Debug)]
pub struct MatchingStreak {
/// The index into the first (typically shorter) string where the streak begins.
pub idx1: usize,
/// The index into the second (typically longer) string where the streak begins.
pub idx2: usize,
/// The size of the matching character streak.
pub size: usize,
}

/// Finds the longest matching streak of characters of `shorter[low1..high1]` in `longer[low2..high2]`.
///
/// Returned as a [MatchingStreak] where
/// `idx1` is an index into `shorter` where the streak begins,
/// `idx2` is an index into `longer` where the streak begins,
/// and `size` is the length of the streak.
///
/// ```
/// # use fuzzywuzzy::segmentation::{Segmenter, CodePointSegmenter};
/// # use fuzzywuzzy::primitives::{ find_longest_match, MatchingStreak};
/// let a = CodePointSegmenter.segment("foo bar");
/// let b = CodePointSegmenter.segment("foo bar baz");
/// let c = CodePointSegmenter.segment("bar baz");
/// assert_eq!(find_longest_match(&a, &b, 0, a.len(), 0, b.len()),
/// MatchingStreak{ idx1: 0, idx2: 0, size: 7 });
/// assert_eq!(find_longest_match(&a, &c, 0, a.len(), 0, c.len()),
/// MatchingStreak{ idx1: 3, idx2: 3, size: 3 });
/// assert_eq!(find_longest_match(&c, &b, 0, c.len(), 0, b.len()),
/// MatchingStreak{ idx1: 0, idx2: 4, size: 7 });
/// ```
pub fn find_longest_match<T: Eq>(
shorter: &[T],
longer: &[T],
low1: usize,
high1: usize,
low2: usize,
high2: usize,
) -> (usize, usize, usize) {
) -> MatchingStreak {
// https://github.com/python-git/python/blob/master/Lib/difflib.py#L351
// algo:
// In other words, of all maximal matching blocks, return one that
@@ -100,6 +141,7 @@ pub fn find_longest_match<T: Eq>(
debug_assert!(low2 <= high2);
debug_assert!(high1 <= shorter.len());
debug_assert!(high2 <= longer.len());
debug_assert!(high1 - low1 <= high2 - low2);
let longsub = &longer[low2..high2];
let len = high1 - low1;
for size in (1..len + 1).rev() {
@@ -108,10 +150,18 @@
for window_start in 0..((high2 - low2) - size + 1) {
let window = &longsub[window_start..window_start + size];
if window == shortsub {
return (low1 + start, low2 + window_start, size);
return MatchingStreak {
idx1: low1 + start,
idx2: low2 + window_start,
size,
};
}
}
}
}
(low1, low2, 0)
MatchingStreak {
idx1: low1,
idx2: low2,
size: 0,
}
}
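get_matching_blocks builds on find_longest_match above. A sketch under the same assumptions as the doctest (CodePointSegmenter comes from this crate's segmentation module); only membership is asserted, since the full return value, including any difflib-style (len1, len2, 0) sentinel, depends on code outside the hunks shown:

use fuzzywuzzy::primitives::get_matching_blocks;
use fuzzywuzzy::segmentation::{CodePointSegmenter, Segmenter};

fn main() {
    let a = CodePointSegmenter.segment("foo bar");
    let b = CodePointSegmenter.segment("foo bar baz");
    // "foo bar" matches at the start of both inputs as a single
    // seven-code-point block (i = 0, j = 0, k = 7).
    let blocks = get_matching_blocks(&a, &b);
    assert!(blocks.contains(&(0, 0, 7)));
}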