Skip to content

Commit

Permalink
Add extract_without_order/extract_one & unit tests
Browse files Browse the repository at this point in the history
Signed-off-by: Sean Pianka <[email protected]>
  • Loading branch information
seanpianka committed Sep 13, 2020
1 parent 92de474 commit f17677b
Show file tree
Hide file tree
Showing 5 changed files with 269 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,5 @@
# Remove Cargo.lock from gitignore if creating an executable, leave it for libraries
# More information here http://doc.crates.io/guide.html#cargotoml-vs-cargolock
Cargo.lock

.idea/
150 changes: 150 additions & 0 deletions src/fuzz.rs
Original file line number Diff line number Diff line change
Expand Up @@ -72,12 +72,16 @@ fn token_sort(s1: &str, s2: &str, partial: bool, force_ascii: bool, full_process

/// Score how similar two strings are, from 0 to 100, after sorting their tokens.
///
/// Delegates to `token_sort` with partial matching switched off.
///
/// For the default behaviour, pass `true` for both `force_ascii` and `full_process`.
pub fn token_sort_ratio(s1: &str, s2: &str, force_ascii: bool, full_process: bool) -> u8 {
    // Whole-string comparison: the partial variant lives in `partial_token_sort_ratio`.
    let partial = false;
    token_sort(s1, s2, partial, force_ascii, full_process)
}

/// Score the most similar substring of two strings, from 0 to 100, after sorting their tokens.
///
/// Delegates to `token_sort` with partial matching switched on.
///
/// For the default behaviour, pass `true` for both `force_ascii` and `full_process`.
pub fn partial_token_sort_ratio(s1: &str, s2: &str, force_ascii: bool, full_process: bool) -> u8 {
    // Substring comparison: the whole-string variant lives in `token_sort_ratio`.
    let partial = true;
    token_sort(s1, s2, partial, force_ascii, full_process)
}
Expand Down Expand Up @@ -220,3 +224,149 @@ pub fn wratio(s1: &str, s2: &str, force_ascii: bool, full_process: bool) -> u8 {
pub fn uwratio(s1: &str, s2: &str, full_process: bool) -> u8 {
    // Same scoring as `wratio`, but ASCII forcing is always disabled.
    let force_ascii = false;
    wratio(s1, s2, force_ascii, full_process)
}

#[cfg(test)]
mod tests {
    use fuzz;
    use utils;

    /// Shared string fixtures for the scorer tests below.
    // Some fixtures (`s6`, `cirque_strings`, `baseball_strings`) are not read by any test yet;
    // they are kept for tests still to be written, so silence the dead-code lint for them.
    #[allow(dead_code)]
    struct Fixture {
        s1: &'static str,
        s1a: &'static str,
        s2: &'static str,
        s3: &'static str,
        s4: &'static str,
        s5: &'static str,
        s6: &'static str,
        s7: &'static str,
        s8: &'static str,
        s8a: &'static str,
        s9: &'static str,
        s9a: &'static str,
        s10: &'static str,
        s10a: &'static str,
        // TODO: Test silly corner cases,
        cirque_strings: &'static [&'static str; 6],
        baseball_strings: &'static [&'static str; 4],
    }

    impl Fixture {
        /// Build the fixture set; every test constructs its own copy.
        pub fn new() -> Self {
            Self {
                s1: "new york mets",
                s1a: "new york mets",
                s2: "new YORK mets",
                s3: "the wonderful new york mets",
                s4: "new york mets vs atlanta braves",
                s5: "atlanta braves vs new york mets",
                s6: "new york mets - atlanta braves",
                s7: "new york city mets - atlanta braves",
                s8: "{",
                s8a: "{",
                s9: "{a",
                s9a: "{a",
                s10: "a{",
                s10a: "{b",
                cirque_strings: &[
                    "cirque du soleil - zarkana - las vegas",
                    "cirque du soleil ",
                    "cirque du soleil las vegas",
                    "zarkana las vegas",
                    "las vegas cirque du soleil at the bellagio",
                    "zarakana - cirque du soleil - bellagio",
                ],
                baseball_strings: &[
                    "new york mets vs chicago cubs",
                    "chicago cubs vs chicago white sox",
                    "philladelphia phillies vs atlanta braves",
                    "braves vs mets",
                ],
            }
        }
    }

    #[test]
    fn test_equal() {
        let f = Fixture::new();
        assert_eq!(fuzz::ratio(f.s1, f.s1a), 100);
        assert_eq!(fuzz::ratio(f.s8, f.s8a), 100);
        assert_eq!(fuzz::ratio(f.s9, f.s9a), 100);
    }

    #[test]
    fn test_case_insensitive() {
        let f = Fixture::new();
        // Raw strings that differ only in case must not score as identical...
        assert_ne!(fuzz::ratio(f.s1, f.s2), 100);
        // ...but they must once both sides have been processed (which lower-cases them).
        // The mixed-case `s2` is used here; comparing `s1` with the identical `s1a` would
        // pass regardless of case handling.
        let processed_s1 = utils::full_process(f.s1, false);
        let processed_s2 = utils::full_process(f.s2, false);
        assert_eq!(fuzz::ratio(processed_s1.as_str(), processed_s2.as_str()), 100);
    }

    #[test]
    fn test_partial_ratio() {
        let f = Fixture::new();
        assert_eq!(fuzz::partial_ratio(f.s1, f.s3), 100);
    }

    #[test]
    fn test_token_sort_ratio() {
        let f = Fixture::new();
        assert_eq!(fuzz::token_sort_ratio(f.s1, f.s1a, true, true), 100);
    }

    #[test]
    fn test_partial_token_sort_ratio() {
        let f = Fixture::new();
        assert_eq!(fuzz::partial_token_sort_ratio(f.s1, f.s1a, true, true), 100);
        assert_eq!(fuzz::partial_token_sort_ratio(f.s4, f.s5, true, true), 100);
        assert_eq!(fuzz::partial_token_sort_ratio(f.s8, f.s8a, true, false), 100);
        assert_eq!(fuzz::partial_token_sort_ratio(f.s9, f.s9a, true, true), 100);
        assert_eq!(fuzz::partial_token_sort_ratio(f.s9, f.s9a, true, false), 100);
        assert_eq!(fuzz::partial_token_sort_ratio(f.s10, f.s10a, true, false), 100);
    }

    #[test]
    fn test_token_set_ratio() {
        let f = Fixture::new();
        assert_eq!(fuzz::token_set_ratio(f.s4, f.s5, true, true), 100);
        assert_eq!(fuzz::token_set_ratio(f.s8, f.s8a, true, false), 100);
        assert_eq!(fuzz::token_set_ratio(f.s9, f.s9a, true, true), 100);
        assert_eq!(fuzz::token_set_ratio(f.s9, f.s9a, true, false), 100);
        assert_eq!(fuzz::token_set_ratio(f.s10, f.s10a, true, false), 50);
    }

    #[test]
    fn test_partial_token_set_ratio() {
        let f = Fixture::new();
        assert_eq!(fuzz::partial_token_set_ratio(f.s4, f.s7, true, true), 100);
    }

    #[test]
    fn test_wratio_equal() {
        let f = Fixture::new();
        assert_eq!(fuzz::wratio(f.s1, f.s1a, true, true), 100);
    }

    #[test]
    fn test_wratio_case_insensitive() {
        let f = Fixture::new();
        assert_eq!(fuzz::wratio(f.s1, f.s2, true, true), 100);
    }

    #[test]
    fn test_wratio_partial_match() {
        let f = Fixture::new();
        assert_eq!(fuzz::wratio(f.s1, f.s3, true, true), 90);
    }

    #[test]
    fn test_wratio_misordered_match() {
        let f = Fixture::new();
        // NOTE(review): 50 looks low for a pure reordering -- the token-sort and token-set
        // tests above score this same pair at 100. Confirm this pins intended `wratio`
        // behaviour rather than a bug.
        assert_eq!(fuzz::wratio(f.s4, f.s5, true, true), 50);
    }

    #[test]
    fn test_empty_string_score_100() {
        assert_eq!(fuzz::ratio("", ""), 100);
        assert_eq!(fuzz::partial_ratio("", ""), 100);
    }
}
1 change: 1 addition & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
pub mod utils;
pub mod fuzz;
pub mod process;

#[cfg(test)]
mod tests {
Expand Down
110 changes: 110 additions & 0 deletions src/process.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
/// Score every choice against `query` and return those that reach `score_cutoff`.
///
/// `query` and each choice are first passed through `processor` (with its flag argument set to
/// `false`); each processed pair is then handed to `scorer` with both of its flags set to
/// `true`. Every choice whose score is greater than or equal to `score_cutoff` is returned in
/// its original, unprocessed form, paired with its score, in the same order as `choices`.
///
/// Returns an empty vector when `choices` is empty. If `processor` reduces `query` to an empty
/// string, a warning quoting the original query is written to standard error and scoring
/// proceeds as usual.
///
/// TODO: Add support for choices as HashMap<&str, &str>, not only as slice &[&str].
pub fn extract_without_order(
    query: &str,
    choices: &[&str],
    processor: &dyn Fn(&str, bool) -> String,
    scorer: &dyn Fn(&str, &str, bool, bool) -> u8,
    score_cutoff: u8,
) -> Vec<(String, u8)> {
    if choices.is_empty() {
        return Vec::new();
    }

    // The query is processed once, up front, and reused for every choice.
    let processed_query = processor(query, false);
    if processed_query.is_empty() {
        // Quote the caller's original query: the processed one is empty by definition here,
        // so printing it would tell the caller nothing.
        eprintln!("Applied processor reduces input query to empty string, all comparisons will have score 0. [Query: '{0}']", query);
    }

    // TODO: Skip `processor` when `scorer` already applies the same processing itself, so that
    // utils::full_process is not run twice per string.
    choices
        .iter()
        .filter_map(|&choice| {
            let processed_choice = processor(choice, false);
            let score = scorer(processed_query.as_str(), processed_choice.as_str(), true, true);
            if score >= score_cutoff {
                Some((choice.to_string(), score))
            } else {
                None
            }
        })
        .collect()
}

/// Find the single best match at or above `score_cutoff` in a list of choices.
///
/// This is a convenience wrapper around `extract_without_order` that keeps only the
/// highest-scoring `(choice, score)` pair. When several choices tie for the highest score, the
/// one that appears last in `choices` is returned. Returns `None` when no choice reaches
/// `score_cutoff`, which includes the case of an empty `choices` slice.
///
/// TODO: Add support for choices as HashMap<&str, &str>, not only as slice &[&str].
pub fn extract_one(
    query: &str,
    choices: &[&str],
    processor: &dyn Fn(&str, bool) -> String,
    scorer: &dyn Fn(&str, &str, bool, bool) -> u8,
    score_cutoff: u8,
) -> Option<(String, u8)> {
    // Consuming the vector moves the winner out instead of cloning every candidate string, and
    // `max_by_key` already yields `None` for an empty iterator, so no emptiness check is needed.
    extract_without_order(query, choices, processor, scorer, score_cutoff)
        .into_iter()
        .max_by_key(|candidate| candidate.1)
}

#[cfg(test)]
mod tests {
    use super::*;
    use utils;
    use fuzz;

    mod process {
        use super::*;

        /// Candidate match-ups that every test below selects from.
        const BASEBALL_STRINGS: [&str; 4] = [
            "new york mets vs chicago cubs",
            "chicago cubs vs chicago white sox",
            "philladelphia phillies vs atlanta braves",
            "braves vs mets",
        ];

        /// Run `extract_one` over the baseball fixtures and return only the winning choice.
        ///
        /// Panics if `extract_one` finds no match.
        fn best_choice_for(query: &str) -> String {
            // Sane defaults: full processing, weighted ratio, no score cutoff.
            let (choice, _score) =
                extract_one(query, &BASEBALL_STRINGS, &utils::full_process, &fuzz::wratio, 0)
                    .unwrap();
            choice
        }

        #[test]
        fn test_get_best_choice1() {
            let winner = best_choice_for("new york mets at atlanta braves");
            assert_eq!(winner.as_str(), BASEBALL_STRINGS[3]);
        }

        #[test]
        fn test_get_best_choice2() {
            let winner = best_choice_for("philadelphia phillies at atlanta braves");
            assert_eq!(winner.as_str(), BASEBALL_STRINGS[2]);
        }

        #[test]
        fn test_get_best_choice3() {
            let winner = best_choice_for("atlanta braves at philadelphia phillies");
            assert_eq!(winner.as_str(), BASEBALL_STRINGS[2]);
        }

        #[test]
        fn test_get_best_choice4() {
            let winner = best_choice_for("chicago cubs vs new york mets");
            assert_eq!(winner.as_str(), BASEBALL_STRINGS[0]);
        }
    }
}
6 changes: 6 additions & 0 deletions src/utils.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
/// Process a string by:
///
/// - removing all characters except letters and numbers,
/// - trimming whitespace,
/// - forcing it to lower case.
///
/// If `force_ascii` is true, the string is also force-converted to ASCII. By default, this
/// should be false.
pub fn full_process(s: &str, force_ascii: bool) -> String {
let mut result = s.to_string();
if force_ascii {
Expand Down

0 comments on commit f17677b

Please sign in to comment.