Implement WRatio

Signed-off-by: Sean Pianka <[email protected]>
logannc · Sep 13, 2020 · 92de474 · 92de474
1 parent 906e322
commit 92de474
Show file tree

Hide file tree

Showing 2 changed files with 93 additions and 4 deletions.
diff --git a/src/fuzz.rs b/src/fuzz.rs
@@ -17,6 +17,7 @@ pub fn ratio(s1: &str, s2: &str) -> u8 {
     }
 }
 
+/// Return the ratio of the most similar substring as a number between 0 and 100.
 pub fn partial_ratio(s1: &str, s2: &str) -> u8 {
     let (shorter, longer) = if s1.len() <= s2.len() {
         (s1.to_string(), s2.to_string())
@@ -43,6 +44,7 @@ pub fn partial_ratio(s1: &str, s2: &str) -> u8 {
     max
 }
 
+/// Return a cleaned string with its tokens sorted.
 fn process_and_sort(s: &str, force_ascii: bool, full_process: bool) -> String {
     let ts = if full_process {
         utils::full_process(s, force_ascii)
@@ -54,6 +56,10 @@ fn process_and_sort(s: &str, force_ascii: bool, full_process: bool) -> String {
     ts_split.join(" ")
 }
 
+/// Sorted-token comparison:
+/// - find all alphanumeric tokens in the string
+/// - sort those tokens and take the ratio of the resulting joined strings
+/// - this controls for unordered string elements
 fn token_sort(s1: &str, s2: &str, partial: bool, force_ascii: bool, full_process: bool) -> u8 {
     let sorted1 = process_and_sort(s1, force_ascii, full_process);
     let sorted2 = process_and_sort(s2, force_ascii, full_process);
@@ -64,14 +70,23 @@ fn token_sort(s1: &str, s2: &str, partial: bool, force_ascii: bool, full_process
     }
 }
 
+/// Return a measure of the sequences' similarity between 0 and 100, but sort the tokens before
+/// comparing.
 pub fn token_sort_ratio(s1: &str, s2: &str, force_ascii: bool, full_process: bool) -> u8 {
     token_sort(s1, s2, false, force_ascii, full_process)
 }
 
+/// Return the ratio of the most similar substring as a number between 0 and 100, but sort the tokens
+/// before comparing.
 pub fn partial_token_sort_ratio(s1: &str, s2: &str, force_ascii: bool, full_process: bool) -> u8 {
     token_sort(s1, s2, true, force_ascii, full_process)
 }
 
+/// Find all alphanumeric tokens in each string, then:
+/// - treat them as a set
+/// - construct two strings of the form `<sorted_intersection><sorted_remainder>`
+/// - take ratios of those two strings
+/// - this controls for unordered partial matches
 fn token_set(s1: &str, s2: &str, partial: bool, force_ascii: bool, full_process: bool) -> u8 {
     let (p1, p2) = if full_process {
         (utils::full_process(s1, force_ascii), utils::full_process(s2, force_ascii))
@@ -126,11 +141,82 @@ pub fn partial_token_set_ratio(s1: &str, s2: &str, force_ascii: bool, full_proce
     token_set(s1, s2, true, force_ascii, full_process)
 }
 
+/// Quick ratio comparison between two strings.
+///
+/// Runs `utils::full_process` on both strings.
+/// Short-circuits to 0 if either of the strings is empty after processing.
 pub fn qratio(s1: &str, s2: &str, force_ascii: bool) -> u8 {
     let (p1, p2) = (utils::full_process(s1, force_ascii), utils::full_process(s2, force_ascii));
+    if !utils::validate_string(p1.as_str()) || !utils::validate_string(p2.as_str()) {
+        return 0;
+    }
     ratio(&p1, &p2)
 }
 
 pub fn uqratio(s1: &str, s2: &str) -> u8 {
     qratio(s1, s2, false)
-}
+}
+
+/// Return a measure of the sequences' similarity between 0 and 100, using different algorithms.
+///
+/// **Steps in the order they occur**
+/// 1. Run full_process from utils on both strings (only when `full_process` is true)
+/// 2. Short circuit (return 0) if either string is empty at this point
+/// 3. Take the ratio of the two processed strings (fuzz.ratio)
+/// 4. Run checks to compare the length of the strings
+///     * If one of the strings is at least 1.5 times as long as the other
+///       use partial_ratio comparisons - scale partial results by 0.9
+///       (this makes sure only full results can return 100)
+///     * If one of the strings is over 8 times as long as the other
+///       instead scale by 0.6
+/// 5. Run the other ratio functions
+///     * if using partial ratio functions call partial_ratio,
+///       partial_token_sort_ratio and partial_token_set_ratio
+///       scale all of these by the ratio based on length
+///     * otherwise call token_sort_ratio and token_set_ratio
+///     * all token based comparisons are scaled by 0.95
+///       (on top of any partial scalars)
+/// 6. Take the highest value from these results,
+///    round it and return it as an integer.
+pub fn wratio(s1: &str, s2: &str, force_ascii: bool, full_process: bool) -> u8 {
+    let (p1, p2) = if full_process {
+        (utils::full_process(s1, force_ascii), utils::full_process(s2, force_ascii))
+    } else {
+        (s1.to_string(), s2.to_string())
+    };
+    let (p1r, p2r) = (p1.as_str(), p2.as_str());
+    if !utils::validate_string(p1r) || !utils::validate_string(p2r) {
+        return 0;
+    }
+    let mut try_partial = true;
+    let unbase_scale = 0.95;
+    let mut partial_scale = 0.90;
+
+    let base = ratio(p1r, p2r);
+    let len_ratio = std::cmp::max(p1.len(), p2.len()) as f64 / std::cmp::min(p1.len(), p2.len()) as f64;
+
+    // if strings are similar length, don't use partials
+    if len_ratio < 1.5 {
+        try_partial = false;
+    }
+
+    // if one string is much shorter than the other
+    if len_ratio > 8.0 {
+        partial_scale = 0.6;
+    }
+
+    if try_partial {
+        let partial = partial_ratio(p1r, p2r) as f64 * partial_scale;
+        let ptsor = partial_token_sort_ratio(p1r, p2r, true, false) as f64 * unbase_scale * partial_scale;
+        let ptser = partial_token_set_ratio(p1r, p2r, true, false) as f64 * unbase_scale * partial_scale;
+        // The NaN seed (0./0.) is discarded by f64::max; each candidate is a u8 scaled by <= 1.0, so the rounded max fits in u8.
+        return vec![base as f64, partial, ptsor, ptser].iter().cloned().fold(0./0., f64::max).round() as u8;
+    }
+    let tsor = token_sort_ratio(p1r, p2r, true, false) as f64 * unbase_scale;
+    let tser = token_set_ratio(p1r, p2r, true, false) as f64 * unbase_scale;
+    vec![base as f64, tsor, tser].iter().cloned().fold(0./0., f64::max).round() as u8
+}
+
+pub fn uwratio(s1: &str, s2: &str, full_process: bool) -> u8 {
+    wratio(s1, s2, false, full_process)
+}
diff --git a/src/utils.rs b/src/utils.rs
@@ -1,15 +1,18 @@
-use std::ascii::AsciiExt;
-
 pub fn full_process(s: &str, force_ascii: bool) -> String {
     let mut result = s.to_string();
     if force_ascii {
-        result = result.chars().filter(AsciiExt::is_ascii).collect();
+        result = result.chars().filter(|c| c.is_ascii()).collect();
     }
     result = result.chars().map(|c| if c.is_alphanumeric() { c } else { ' ' }).collect();
     result.make_ascii_lowercase();
     result.trim().to_string()
 }
 
+/// Returns `true` if the input string is non-empty.
+pub fn validate_string(s: &str) -> bool {
+    !s.is_empty()
+}
+
 fn find_longest_match<'a>(shorter: &'a str,
                           longer: &'a str,
                           low1: usize,