From f17677b08c064b2ef3cb7e1d570c2cb67f0e4bd1 Mon Sep 17 00:00:00 2001 From: Sean Pianka Date: Sat, 12 Sep 2020 19:03:00 -0400 Subject: [PATCH] Add extract_without_order/extract_one & unit tests Signed-off-by: Sean Pianka --- .gitignore | 2 + src/fuzz.rs | 150 +++++++++++++++++++++++++++++++++++++++++++++++++ src/lib.rs | 1 + src/process.rs | 110 ++++++++++++++++++++++++++++++++++++ src/utils.rs | 6 ++ 5 files changed, 269 insertions(+) diff --git a/.gitignore b/.gitignore index cb14a42..5737ae5 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,5 @@ # Remove Cargo.lock from gitignore if creating an executable, leave it for libraries # More information here http://doc.crates.io/guide.html#cargotoml-vs-cargolock Cargo.lock + +.idea/ diff --git a/src/fuzz.rs b/src/fuzz.rs index 9ddbc8b..07e9661 100644 --- a/src/fuzz.rs +++ b/src/fuzz.rs @@ -72,12 +72,16 @@ fn token_sort(s1: &str, s2: &str, partial: bool, force_ascii: bool, full_process /// Return a measure of the sequences' similarity between 0 and 100, but sort the token before /// comparing. +/// +/// By default, force_ascii and full_process should be true. pub fn token_sort_ratio(s1: &str, s2: &str, force_ascii: bool, full_process: bool) -> u8 { token_sort(s1, s2, false, force_ascii, full_process) } /// Return the ratio of the most similar substring as a number between 0 and 100, but sort the token /// before comparing. +/// +/// By default, force_ascii and full_process should be true. 
pub fn partial_token_sort_ratio(s1: &str, s2: &str, force_ascii: bool, full_process: bool) -> u8 { token_sort(s1, s2, true, force_ascii, full_process) } @@ -220,3 +224,149 @@ pub fn wratio(s1: &str, s2: &str, force_ascii: bool, full_process: bool) -> u8 { pub fn uwratio(s1: &str, s2: &str, full_process: bool) -> u8 { wratio(s1, s2, false, full_process) } + +#[cfg(test)] +mod tests { + use fuzz; + use utils; + + struct Fixture { + s1: &'static str, + s1a: &'static str, + s2: &'static str, + s3: &'static str, + s4: &'static str, + s5: &'static str, + s6: &'static str, + s7: &'static str, + s8: &'static str, + s8a: &'static str, + s9: &'static str, + s9a: &'static str, + s10: &'static str, + s10a: &'static str, + // TODO: Test silly corner cases, + cirque_strings: &'static [&'static str; 6], + baseball_strings: &'static [&'static str; 4], + } + + impl Fixture { + pub fn new() -> Self { + Self { + s1: "new york mets", + s1a: "new york mets", + s2: "new YORK mets", + s3: "the wonderful new york mets", + s4: "new york mets vs atlanta braves", + s5: "atlanta braves vs new york mets", + s6: "new york mets - atlanta braves", + s7: "new york city mets - atlanta braves", + s8: "{", + s8a: "{", + s9: "{a", + s9a: "{a", + s10: "a{", + s10a: "{b", + cirque_strings: &[ + "cirque du soleil - zarkana - las vegas", + "cirque du soleil ", + "cirque du soleil las vegas", + "zarkana las vegas", + "las vegas cirque du soleil at the bellagio", + "zarakana - cirque du soleil - bellagio" + ], + baseball_strings: &[ + "new york mets vs chicago cubs", + "chicago cubs vs chicago white sox", + "philladelphia phillies vs atlanta braves", + "braves vs mets", + ] + } + } + } + + #[test] + fn test_equal() { + let f = Fixture::new(); + assert_eq!(fuzz::ratio(f.s1, f.s1a), 100); + assert_eq!(fuzz::ratio(f.s8, f.s8a), 100); + assert_eq!(fuzz::ratio(f.s9, f.s9a), 100); + } + + #[test] + fn test_case_insensitive() { + let f = Fixture::new(); + assert_ne!(fuzz::ratio(f.s1, f.s2), 100); + 
assert_eq!(fuzz::ratio(utils::full_process(f.s1, false).as_str(), utils::full_process(f.s1a, false).as_str()), 100); + } + + #[test] + fn test_partial_ratio() { + let f = Fixture::new(); + assert_eq!(fuzz::partial_ratio(f.s1, f.s3), 100) + } + + #[test] + fn test_token_sort_ratio() { + let f = Fixture::new(); + assert_eq!(fuzz::token_sort_ratio(f.s1, f.s1a, true, true), 100) + } + + #[test] + fn test_partial_token_sort_ratio() { + let f = Fixture::new(); + assert_eq!(fuzz::partial_token_sort_ratio(f.s1, f.s1a, true, true), 100); + assert_eq!(fuzz::partial_token_sort_ratio(f.s4, f.s5, true, true), 100); + assert_eq!(fuzz::partial_token_sort_ratio(f.s8, f.s8a, true, false), 100); + assert_eq!(fuzz::partial_token_sort_ratio(f.s9, f.s9a, true, true), 100); + assert_eq!(fuzz::partial_token_sort_ratio(f.s9, f.s9a, true, false), 100); + assert_eq!(fuzz::partial_token_sort_ratio(f.s10, f.s10a, true, false), 100); + } + + #[test] + fn test_token_set_ratio() { + let f = Fixture::new(); + assert_eq!(fuzz::token_set_ratio(f.s4, f.s5, true, true), 100); + assert_eq!(fuzz::token_set_ratio(f.s8, f.s8a, true, false), 100); + assert_eq!(fuzz::token_set_ratio(f.s9, f.s9a, true, true), 100); + assert_eq!(fuzz::token_set_ratio(f.s9, f.s9a, true, false), 100); + assert_eq!(fuzz::token_set_ratio(f.s10, f.s10a, true, false), 50); + } + + #[test] + fn test_partial_token_set_ratio() { + let f = Fixture::new(); + assert_eq!(fuzz::partial_token_set_ratio(f.s4, f.s7, true, true), 100); + } + + #[test] + fn test_wratio_equal() { + let f = Fixture::new(); + assert_eq!(fuzz::wratio(f.s1, f.s1a, true, true), 100); + } + + #[test] + fn test_wratio_case_insensitive() { + let f = Fixture::new(); + assert_eq!(fuzz::wratio(f.s1, f.s2, true, true), 100); + } + + #[test] + fn test_wratio_partial_match() { + let f = Fixture::new(); + assert_eq!(fuzz::wratio(f.s1, f.s3, true, true), 90); + } + + #[test] + fn test_wratio_misordered_match() { + let f = Fixture::new(); + assert_eq!(fuzz::wratio(f.s4, f.s5, 
true, true), 50); + } + + #[test] + fn test_empty_string_score_100() { + let f = Fixture::new(); + assert_eq!(fuzz::ratio("", ""), 100); + assert_eq!(fuzz::partial_ratio("", ""), 100); + } +} \ No newline at end of file diff --git a/src/lib.rs b/src/lib.rs index e8ff9f8..17104bf 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,5 +1,6 @@ pub mod utils; pub mod fuzz; +pub mod process; #[cfg(test)] mod tests { diff --git a/src/process.rs b/src/process.rs index e69de29..ab97827 100644 --- a/src/process.rs +++ b/src/process.rs @@ -0,0 +1,110 @@ +/// Find the matches in a slice of choices that meet a score cutoff. +/// +/// Score each choice against the query and return a vector of tuples containing the choice +/// and its score, keeping only choices whose score is at least `score_cutoff`. +/// +/// TODO: Add support for choices as HashMap<&str, &str>, not only as slice &[&str]. +pub fn extract_without_order( + query: &str, + choices: &[&str], + processor: &dyn Fn(&str, bool) -> String, + scorer: &dyn Fn(&str, &str, bool, bool) -> u8, + score_cutoff: u8 +) -> Vec<(String, u8)> { + if choices.is_empty() { + return vec![]; + } + + let processed_query: String = processor(query, false); + if processed_query.is_empty() { + println!("Applied processor reduces input query to empty string, all comparisons will have score 0. [Query: '{0}']", processed_query.as_str()); + } + + // TODO: Check if scorer in list of known processor functions to avoid calling utils::full_process multiple times. + // TODO: Only process the query once instead of for every choice. + + let mut results = vec![]; + for choice in choices { + let processed: String = processor(choice, false); + let score: u8 = scorer(processed_query.as_str(), processed.as_str(), true, true); + if score >= score_cutoff { + results.push((choice.to_string(), score)) + } + } + results +} + +/// Find the single best match at or above a score cutoff in a list of choices. 
+/// +/// This is a convenience method which returns the single best choice. +/// +/// TODO: Add support for choices as HashMap<&str, &str>, not only as slice &[&str]. +pub fn extract_one( + query: &str, + choices: &[&str], + processor: &dyn Fn(&str, bool) -> String, + scorer: &dyn Fn(&str, &str, bool, bool) -> u8, + score_cutoff: u8 +) -> Option<(String, u8)> { + let best = extract_without_order(query, choices, processor, scorer, score_cutoff); + if best.is_empty() { + return None + } + best.iter().cloned().max_by(|(_, acc_score), (_, score)| { + acc_score.cmp(score) + }) +} + +#[cfg(test)] +mod tests { + use super::*; + use utils; + use fuzz; + + mod process { + use super::*; + + fn get_baseball_strings() -> &'static [&'static str] { + &[ + "new york mets vs chicago cubs", + "chicago cubs vs chicago white sox", + "philladelphia phillies vs atlanta braves", + "braves vs mets", + ] + } + + // Call extract_one, unwrap the option, and return 0th element (the choice). + fn unwrap_extract_one_choice(query: &str) -> String { + // Specify sane defaults. 
+ extract_one(query, get_baseball_strings(), &utils::full_process, &fuzz::wratio, 0).unwrap().0 + } + + #[test] + fn test_get_best_choice1() { + let query = "new york mets at atlanta braves"; + let best = unwrap_extract_one_choice(query); + assert_eq!(best.as_str(), get_baseball_strings()[3]) + } + + #[test] + fn test_get_best_choice2() { + let query = "philadelphia phillies at atlanta braves"; + let best = unwrap_extract_one_choice(query); + assert_eq!(best.as_str(), get_baseball_strings()[2]) + } + + #[test] + fn test_get_best_choice3() { + let query = "atlanta braves at philadelphia phillies"; + let best = unwrap_extract_one_choice(query); + assert_eq!(best.as_str(), get_baseball_strings()[2]) + } + + #[test] + fn test_get_best_choice4() { + let query = "chicago cubs vs new york mets"; + let best = unwrap_extract_one_choice(query); + assert_eq!(best.as_str(), get_baseball_strings()[0]) + } + } +} \ No newline at end of file diff --git a/src/utils.rs b/src/utils.rs index de1482c..ff3d3db 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -1,3 +1,9 @@ +/// Process a string by: +/// - removing all characters except letters and numbers +/// - trimming whitespace +/// - forcing to lower case +/// +/// If `force_ascii` is true, force conversion to ASCII. By default, this should be false. pub fn full_process(s: &str, force_ascii: bool) -> String { let mut result = s.to_string(); if force_ascii {