add more documentation examples, some renames per PR discussion
logannc committed Feb 20, 2021
1 parent 89b6ee6 commit cf2765d
Showing 4 changed files with 214 additions and 48 deletions.
3 changes: 3 additions & 0 deletions Cargo.toml
@@ -19,3 +19,6 @@ normalization = ["unicode-normalization"]
[dependencies]
unicode-segmentation = { version = "1.7.1", optional = true }
unicode-normalization = { version = "0.1.17", optional = true }

[dev-dependencies]
rand = "0.8.0"
156 changes: 114 additions & 42 deletions src/normalization.rs
@@ -5,20 +5,31 @@
//! you might consider the [LowerCaseNormalizer].
//!
//! ```
//! # use fuzzywuzzy::normalization::{Normalizer, LowerCaseNormalizer, FormCNormalization, MultipleNormalizer};
//! # use fuzzywuzzy::normalization::{Normalizer, LowerCaseNormalizer, FormCNormalizer, ComposedNormalizer};
//! assert_eq!(LowerCaseNormalizer.normalize("this STRING"), LowerCaseNormalizer.normalize("THIS string"));
//! let a1 = "ä"; // U+00E4
//! let a2 = "ä"; // U+0061 + U+0308
//! let a3 = "Ä"; // U+0041 + U+0308
//! assert_ne!(a1, a2);
//! assert_eq!(FormCNormalization.normalize(a2), a1);
//! let multiple_normalizers = MultipleNormalizer::with(vec![Box::new(LowerCaseNormalizer), Box::new(FormCNormalization)]);
//! assert_eq!(FormCNormalizer.normalize(a2), a1);
//! let multiple_normalizers = ComposedNormalizer::with(
//! vec![Box::new(LowerCaseNormalizer), Box::new(FormCNormalizer)]);
//! assert_eq!(multiple_normalizers.normalize(a3), a1);
//! ```
/// Represents a strategy for normalizing string characters into a canonical value of their equivalence class.
///
/// E.g., in a case-insensitive context, 'a' might be the canonical value for the equivalence class of ASCII A's: `['a', 'A']`.
///
/// In addition to implementers of the trait, functions with a matching type signature also work.
/// ```
/// # use fuzzywuzzy::normalization::{Normalizer, LowerCaseNormalizer};
/// let test_string = "test STRING";
/// fn custom_normalizer(s: &str) -> String { s.to_lowercase() }
/// assert_eq!(
/// LowerCaseNormalizer.normalize(&test_string),
/// custom_normalizer.normalize(&test_string));
/// ```
pub trait Normalizer {
fn normalize(&self, s: &str) -> String;
}
@@ -30,6 +41,16 @@ impl<F: Fn(&str) -> String> Normalizer for F {
}

/// Doesn't modify any characters. The Identity-transform [Normalizer].
///
/// ```
/// # use fuzzywuzzy::normalization::{Normalizer, PassthroughNormalizer};
/// use rand::{thread_rng, Rng};
/// use rand::distributions::Alphanumeric;
/// let random_string: String = thread_rng()
/// .sample_iter(&Alphanumeric)
/// .take(16).map(char::from).collect();
/// assert_eq!(PassthroughNormalizer.normalize(&random_string), random_string);
/// ```
pub struct PassthroughNormalizer;

impl Normalizer for PassthroughNormalizer {
@@ -38,21 +59,28 @@ impl Normalizer for PassthroughNormalizer {
}
}

// ew, need a better name
/// Compose a sequence of [Normalizer]s together into one [Normalizer].
///
/// They are executed in order.
pub struct MultipleNormalizer {
/// They are executed in sequential order.
/// ```
/// # use fuzzywuzzy::normalization::{Normalizer, LowerCaseNormalizer, FormCNormalizer, ComposedNormalizer};
/// let a1 = "ä"; // U+00E4
/// let a2 = "Ä"; // U+0041 + U+0308
/// let multiple_normalizers = ComposedNormalizer::with(
/// vec![Box::new(LowerCaseNormalizer), Box::new(FormCNormalizer)]);
/// assert_eq!(multiple_normalizers.normalize(a2), a1);
/// ```
pub struct ComposedNormalizer {
normalizers: Vec<Box<dyn Normalizer>>,
}

impl MultipleNormalizer {
pub fn with(normalizers: Vec<Box<dyn Normalizer>>) -> MultipleNormalizer {
MultipleNormalizer { normalizers }
impl ComposedNormalizer {
pub fn with(normalizers: Vec<Box<dyn Normalizer>>) -> ComposedNormalizer {
ComposedNormalizer { normalizers }
}
}

impl Normalizer for MultipleNormalizer {
impl Normalizer for ComposedNormalizer {
fn normalize(&self, s: &str) -> String {
let mut current = s.to_owned();
for normalizer in self.normalizers.iter() {
@@ -63,6 +91,11 @@ impl Normalizer for MultipleNormalizer {
}

/// Normalizes strings by lower-casing all letters.
///
/// ```
/// # use fuzzywuzzy::normalization::{Normalizer, LowerCaseNormalizer};
/// assert_eq!(LowerCaseNormalizer.normalize("this STRING"), LowerCaseNormalizer.normalize("THIS string"));
/// ```
pub struct LowerCaseNormalizer;

impl Normalizer for LowerCaseNormalizer {
@@ -72,9 +105,17 @@ impl Normalizer for LowerCaseNormalizer {
}

/// Removes non-ASCII codepoints.
pub struct AsciiOnlyFilter;

impl Normalizer for AsciiOnlyFilter {
///
/// Notably, this does not ASCII-ify non-ASCII characters; it just removes them.
/// If you want to turn characters into ASCII decompositions, look at
/// [FormDNormalizer], [FormKDNormalizer], or [UnicodeToAsciiNormalizer].
/// ```
/// # use fuzzywuzzy::normalization::{Normalizer, AsciiOnlyNormalizer};
/// assert_eq!(AsciiOnlyNormalizer.normalize("äbc"), "bc");
/// ```
pub struct AsciiOnlyNormalizer;

impl Normalizer for AsciiOnlyNormalizer {
fn normalize(&self, s: &str) -> String {
s.chars().filter(char::is_ascii).collect()
}
@@ -88,74 +129,105 @@ mod unicode_normalizers {
use super::Normalizer;
use unicode_normalization::UnicodeNormalization;

/// Performs Unicode Normalization Form C (canonical decomposition followed by canonical composition).
/// Performs Unicode Normalization Form C (canonical decomposition followed by canonical composition). Requires default feature "normalization".
///
/// This just delegates to [unicode_normalization].
pub struct FormCNormalization;
/// This just delegates to [unicode_normalization::UnicodeNormalization::nfc].
///
/// ```
/// # use fuzzywuzzy::normalization::{Normalizer, FormCNormalizer};
/// let a1 = "ä"; // U+00E4
/// let a2 = "ä"; // U+0061 + U+0308
/// assert_ne!(a1, a2);
/// assert_eq!(FormCNormalizer.normalize(a2), a1);
/// ```
pub struct FormCNormalizer;

impl Normalizer for FormCNormalization {
impl Normalizer for FormCNormalizer {
fn normalize(&self, s: &str) -> String {
s.nfc().collect()
}
}

/// Performs Unicode Normalization Form KC (compatibility decomposition followed by canonical composition).
/// Performs Unicode Normalization Form KC (compatibility decomposition followed by canonical composition). Requires default feature "normalization".
///
/// This just delegates to [unicode_normalization::UnicodeNormalization::nfkc].
///
/// This just delegates to [unicode_normalization].
pub struct FormKCNormalization;
/// ```
/// # use fuzzywuzzy::normalization::{Normalizer, FormKCNormalizer};
/// let a1 = "ä"; // U+00E4
/// let a2 = "ä"; // U+0061 + U+0308
/// assert_ne!(a1, a2);
/// assert_eq!(FormKCNormalizer.normalize(a2), a1);
/// ```
pub struct FormKCNormalizer;

impl Normalizer for FormKCNormalization {
impl Normalizer for FormKCNormalizer {
fn normalize(&self, s: &str) -> String {
s.nfkc().collect()
}
}

/// Performs Unicode Normalization Form D (canonical decomposition).
/// Performs Unicode Normalization Form D (canonical decomposition). Requires default feature "normalization".
///
/// This just delegates to [unicode_normalization::UnicodeNormalization::nfd].
///
/// This just delegates to [unicode_normalization].
/// ```
/// # use fuzzywuzzy::normalization::{Normalizer, FormDNormalization};
/// // FormDNormalization.normalize(U+00E4) == (U+0061 + U+0308)
/// assert_eq!(FormDNormalization.normalize("ä"), "a\u{0308}");
/// # use fuzzywuzzy::normalization::{Normalizer, FormDNormalizer};
/// // FormDNormalizer.normalize(U+00E4) == (U+0061 + U+0308)
/// assert_eq!(FormDNormalizer.normalize("ä"), "a\u{0308}");
/// ```
pub struct FormDNormalization;
impl Normalizer for FormDNormalization {
pub struct FormDNormalizer;
impl Normalizer for FormDNormalizer {
fn normalize(&self, s: &str) -> String {
s.nfd().collect()
}
}

/// Performs Unicode Normalization Form KD (compatibility decomposition).
/// Performs Unicode Normalization Form KD (compatibility decomposition). Requires default feature "normalization".
///
/// This just delegates to [unicode_normalization::UnicodeNormalization::nfkd].
///
/// This just delegates to [unicode_normalization].
pub struct FormKDNormalization;
impl Normalizer for FormKDNormalization {
/// ```
/// # use fuzzywuzzy::normalization::{Normalizer, FormKDNormalizer};
/// // FormKDNormalizer.normalize(U+00E4) == (U+0061 + U+0308)
/// assert_eq!(FormKDNormalizer.normalize("ä"), "a\u{0308}");
/// ```
pub struct FormKDNormalizer;
impl Normalizer for FormKDNormalizer {
fn normalize(&self, s: &str) -> String {
s.nfkd().collect()
}
}

/// Performs CJK Compatibility Ideograph-to-Standardized Variation Sequence normalization.
/// Performs CJK Compatibility Ideograph-to-Standardized Variation Sequence normalization. Requires default feature "normalization".
///
/// This just delegates to [unicode_normalization]. "This is not part of the canonical or compatibility decomposition algorithms, but performing it before those algorithms produces normalized output which better preserves the intent of the original text."
pub struct CJKNormalization;
impl Normalizer for CJKNormalization {
/// This just delegates to [unicode_normalization::UnicodeNormalization::cjk_compat_variants].
///
/// > "This is not part of the canonical or compatibility decomposition algorithms, but performing it before those algorithms produces normalized output which better preserves the intent of the original text." -- [unicode_normalization](unicode_normalization::UnicodeNormalization::cjk_compat_variants)
pub struct CJKNormalizer;
impl Normalizer for CJKNormalizer {
fn normalize(&self, s: &str) -> String {
s.cjk_compat_variants().collect()
}
}

/// Decomposes a string, then removes non-ASCII code points.
/// Decomposes a string, then removes non-ASCII code points. Requires default feature "normalization".
///
/// Caution is needed when applying this [Normalizer].
/// While it may improve comparisons for Latin-script languages, whose text often decomposes largely into ASCII plus combining diacritics,
/// it will perform poorly on less ASCII-centric languages.
///
/// ```
/// # use fuzzywuzzy::normalization::{Normalizer, UnicodeToAsciiNormalization};
/// # use fuzzywuzzy::normalization::{Normalizer, UnicodeToAsciiNormalizer};
/// // U+00E4
/// assert_eq!(UnicodeToAsciiNormalization.normalize("ä"), "a");
/// assert_eq!(UnicodeToAsciiNormalizer.normalize("äbc"), "abc");
/// // U+0061 + U+0308
/// assert_eq!(UnicodeToAsciiNormalization.normalize("a\u{0308}"), "a");
/// assert_eq!(UnicodeToAsciiNormalizer.normalize("a\u{0308}bc"), "abc");
/// // This is probably not what you want!
/// assert_eq!(UnicodeToAsciiNormalizer.normalize("किमप"), "");
/// ```
pub struct UnicodeToAsciiNormalization;
impl Normalizer for UnicodeToAsciiNormalization {
pub struct UnicodeToAsciiNormalizer;
impl Normalizer for UnicodeToAsciiNormalizer {
fn normalize(&self, s: &str) -> String {
s.nfd().filter(char::is_ascii).collect()
}
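Of the normalizers documented above, CJKNormalizer is the only one whose new docs lack a runnable example. A minimal usage sketch, not part of the diff, assuming the "normalization" feature is enabled and the post-rename names from this commit; the ASCII input is chosen only to show that characters without compatibility variants pass through unchanged:

use fuzzywuzzy::normalization::{CJKNormalizer, ComposedNormalizer, FormCNormalizer, Normalizer};

fn main() {
    // Map CJK compatibility ideographs to standardized variation sequences
    // *before* composing, per the unicode_normalization guidance quoted above.
    let pipeline = ComposedNormalizer::with(vec![
        Box::new(CJKNormalizer),
        Box::new(FormCNormalizer),
    ]);
    // Characters with no CJK compatibility variant pass through unchanged.
    assert_eq!(pipeline.normalize("abc"), "abc");
}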
60 changes: 55 additions & 5 deletions src/primitives.rs
@@ -37,7 +37,13 @@ pub fn get_matching_blocks<T: Eq>(a: &[T], b: &[T]) -> Vec<(usize, usize, usize)>
let mut queue = vec![(0, len1, 0, len2)];
let mut matching_blocks = Vec::new();
while let Some((low1, high1, low2, high2)) = queue.pop() {
let (i, j, k) = find_longest_match(shorter, longer, low1, high1, low2, high2);
// TODO: I'd like to convert this function to use MatchingStreaks internally.
// It might make it clearer to compare low1 < streak.idx1 instead of low1 < i.
let MatchingStreak {
idx1: i,
idx2: j,
size: k,
} = find_longest_match(shorter, longer, low1, high1, low2, high2);
debug_assert!(i <= shorter.len());
debug_assert!(j <= longer.len());
if k != 0 {
@@ -53,10 +59,13 @@ pub fn get_matching_blocks<T: Eq>(a: &[T], b: &[T]) -> Vec<(usize, usize, usize)>
matching_blocks.sort_unstable();
let (mut i1, mut j1, mut k1) = (0, 0, 0);
let mut non_adjacent = Vec::new();
// collapse adjacent blocks
for (i2, j2, k2) in matching_blocks {
if i1 + k1 == i2 && j1 + k1 == j2 {
// blocks are adjacent, combine
k1 += k2;
} else {
// not adjacent, push if it isn't the first dummy block.
if k1 != 0 {
non_adjacent.push((i1, j1, k1));
}
@@ -75,15 +84,47 @@ pub fn get_matching_blocks<T: Eq>(a: &[T], b: &[T]) -> Vec<(usize, usize, usize)>
.collect()
}
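The adjacency-collapsing loop above can be exercised standalone. A sketch with hypothetical block data, not part of the diff, mirroring the dummy-block trick where the initial (0, 0, 0) seeds the fold:

fn main() {
    // Sorted (i, j, k) blocks: (0, 0, 2) and (2, 2, 3) touch end-to-start in
    // both strings and merge into (0, 0, 5); (7, 8, 1) stays separate.
    let matching_blocks = vec![(0usize, 0usize, 2usize), (2, 2, 3), (7, 8, 1)];
    let (mut i1, mut j1, mut k1) = (0, 0, 0);
    let mut non_adjacent = Vec::new();
    for (i2, j2, k2) in matching_blocks {
        if i1 + k1 == i2 && j1 + k1 == j2 {
            // Adjacent in both strings: extend the current block.
            k1 += k2;
        } else {
            // Emit the accumulated block unless it is the initial dummy,
            // then start a new one.
            if k1 != 0 {
                non_adjacent.push((i1, j1, k1));
            }
            i1 = i2;
            j1 = j2;
            k1 = k2;
        }
    }
    if k1 != 0 {
        non_adjacent.push((i1, j1, k1));
    }
    assert_eq!(non_adjacent, vec![(0, 0, 5), (7, 8, 1)]);
}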

/// TODO: doc + tests
/// Represents a matching streak of characters between two strings.
///
/// See [find_longest_match] for details.
#[derive(PartialEq, Eq, Copy, Clone, Debug)]
pub struct MatchingStreak {
/// The index into the first (typically shorter) string where the streak begins.
pub idx1: usize,
/// The index into the second (typically longer) string where the streak begins.
pub idx2: usize,
/// The size of the matching character streak.
pub size: usize,
}

/// Finds the longest matching streak of characters of `shorter[low1..high1]` in `longer[low2..high2]`.
///
/// Returned as a [MatchingStreak] where
/// `idx1` is an index into `shorter` where the streak begins,
/// `idx2` is an index into `longer` where the streak begins,
/// and `size` is the length of the streak.
///
/// ```
/// # use fuzzywuzzy::segmentation::{Segmenter, CodePointSegmenter};
/// # use fuzzywuzzy::primitives::{ find_longest_match, MatchingStreak};
/// let a = CodePointSegmenter.segment("foo bar");
/// let b = CodePointSegmenter.segment("foo bar baz");
/// let c = CodePointSegmenter.segment("bar baz");
/// assert_eq!(find_longest_match(&a, &b, 0, a.len(), 0, b.len()),
/// MatchingStreak{ idx1: 0, idx2: 0, size: 7 });
/// assert_eq!(find_longest_match(&a, &c, 0, a.len(), 0, c.len()),
/// MatchingStreak{ idx1: 3, idx2: 3, size: 3 });
/// assert_eq!(find_longest_match(&c, &b, 0, c.len(), 0, b.len()),
/// MatchingStreak{ idx1: 0, idx2: 4, size: 7 });
/// ```
pub fn find_longest_match<T: Eq>(
shorter: &[T],
longer: &[T],
low1: usize,
high1: usize,
low2: usize,
high2: usize,
) -> (usize, usize, usize) {
) -> MatchingStreak {
// https://github.com/python-git/python/blob/master/Lib/difflib.py#L351
// algo:
// In other words, of all maximal matching blocks, return one that
@@ -100,6 +141,7 @@ pub fn find_longest_match<T: Eq>(
debug_assert!(low2 <= high2);
debug_assert!(high1 <= shorter.len());
debug_assert!(high2 <= longer.len());
debug_assert!(high1 - low1 <= high2 - low2);
let longsub = &longer[low2..high2];
let len = high1 - low1;
for size in (1..len + 1).rev() {
@@ -108,10 +150,18 @@
for window_start in 0..((high2 - low2) - size + 1) {
let window = &longsub[window_start..window_start + size];
if window == shortsub {
return (low1 + start, low2 + window_start, size);
return MatchingStreak {
idx1: low1 + start,
idx2: low2 + window_start,
size,
};
}
}
}
}
(low1, low2, 0)
MatchingStreak {
idx1: low1,
idx2: low2,
size: 0,
}
}
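get_matching_blocks builds on find_longest_match above. A sketch under the same assumptions as the doctest (CodePointSegmenter comes from this crate's segmentation module); only membership is asserted, since the full return value, including any difflib-style (len1, len2, 0) sentinel, depends on code outside the hunks shown:

use fuzzywuzzy::primitives::get_matching_blocks;
use fuzzywuzzy::segmentation::{CodePointSegmenter, Segmenter};

fn main() {
    let a = CodePointSegmenter.segment("foo bar");
    let b = CodePointSegmenter.segment("foo bar baz");
    // "foo bar" matches at the start of both inputs as a single
    // seven-code-point block (i = 0, j = 0, k = 7).
    let blocks = get_matching_blocks(&a, &b);
    assert!(blocks.contains(&(0, 0, 7)));
}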