Skip to content

Commit

Permalink
Fix highlight for Cyrillic letters
Browse files Browse the repository at this point in the history
Merge pull request #62 from Endle/Cyrillic
Not tested #59
  • Loading branch information
Endle authored Nov 12, 2022
1 parent 80b7162 commit 172acfb
Show file tree
Hide file tree
Showing 6 changed files with 69 additions and 17 deletions.
2 changes: 1 addition & 1 deletion fire_seq_search_server/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "fire_seq_search_server"
version = "0.0.17"
version = "0.0.19"
edition = "2021"


Expand Down
39 changes: 27 additions & 12 deletions fire_seq_search_server/src/post_query/mod.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
use log::error;
use stopwords;
use regex::RegexBuilder;

pub fn highlight_keywords_in_body(body: &str, term_tokens: &Vec<String>) -> String {

Expand Down Expand Up @@ -46,21 +48,14 @@ pub fn recursive_wrap(sentence: &str, term_tokens: &[String]) -> String {
let span_start = "<span class=\"fireSeqSearchHighlight\">";
let span_end = "</span>";
let token = &term_tokens[0];
if !sentence.contains(token) {
let lower_token = token.to_ascii_lowercase();
let lower_sentence = sentence.to_ascii_lowercase();
if lower_sentence.contains(&lower_token) {
//FIXME This is a hack for English words
let mut new_terms = Vec::from(term_tokens);
new_terms[0] = lower_token;
return recursive_wrap(&lower_sentence, &new_terms);
}


let segments = split_by_single_token(sentence, token);
// Found nothing for this token
if segments.len() <= 1 {
return recursive_wrap(sentence, &term_tokens[1..]);
}

let mut result = Vec::new();
for seg in sentence.split(token) {
for seg in segments {
let r = recursive_wrap(seg, &term_tokens[1..]);
result.push(r);
}
Expand All @@ -69,6 +64,26 @@ pub fn recursive_wrap(sentence: &str, term_tokens: &[String]) -> String {
result.join(&wrapped)
}

pub fn split_by_single_token<'a>(sentence: &'a str, token: &'a str) -> Vec<&'a str> {
let mut result = Vec::new();
let needle = RegexBuilder::new(token)
.case_insensitive(true)
.build();
let needle = match needle {
Ok(x) => x,
Err(e) => {
error!("Failed({}) to build regex for {}", e, token);
return result;
}
};
let segs: Vec<&str> = needle.split(sentence).collect();
for seg in segs {
result.push(seg);
}
result
}


// TODO: the current implementation is too naive and is probably buggy.
pub fn split_body_to_blocks(body: &str) -> Vec<String> {
let mut result = Vec::new();
Expand Down
1 change: 1 addition & 0 deletions fire_seq_search_server/tests/resource/pages/cyrillic.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
- Это статья для тестов поиска в кириллических символах.
2 changes: 1 addition & 1 deletion fire_seq_search_server/tests/unit_test_load_notes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ use fire_seq_search_server::markdown_parser::parse_to_plain_text;
#[test]
fn load_articles() {
let r = read_specific_directory("tests/resource/pages");
assert_eq!(r.len(), 7);
assert_eq!(r.len(), 8);
for (title,body) in &r{
assert!(title.len()>0);
assert!(body.len()>0);
Expand Down
38 changes: 37 additions & 1 deletion fire_seq_search_server/tests/unit_test_post_query.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@

use fire_seq_search_server::post_query::{highlight_keywords_in_body, split_body_to_blocks};
use fire_seq_search_server::post_query::{highlight_keywords_in_body, recursive_wrap, split_body_to_blocks, split_by_single_token};

fn get_english_text() -> String {
std::fs::read_to_string("tests/resource/pages/International Language, Past, Present & Future by Walter John Clark.md")
Expand Down Expand Up @@ -43,4 +43,40 @@ fn test_split_to_block() {
assert_eq!("As an ounce of personal experience is worth a pound of second-hand recital, a brief statement may here be given of the way in which the present writer came to take up Esperanto, and of the experiences which soon led him to the conviction of its absolute practicability and utility.", &blocks[0]);
assert_eq!("Now, quite apart from the obvious fact that the nations will never agree to give the preference to the language of one of them to the prejudice of the others, this argument involves the 16 suggestion that an artificial language is no easier to learn than a natural one. We thus come to the question of ease as a qualification.", &blocks[12]);
assert_eq!(14, blocks.len());
}

#[test]
fn test_split_by_single_token() {
    // Each case is (sentence, token, expected number of segments).
    // The behaviour of the helper is not considered stable yet.
    let cases = [
        ("As an ounce of personal experience is worth a pound of", "personal", 2),
        ("no such", "exist", 1),
        ("母猪都能上树", "上", 2),
        ("Это статья для тестов поиска в кириллических символах", "для", 2),
        ("head is match", "head", 2),
    ];

    for &(sentence, token, expected) in cases.iter() {
        let segments = split_by_single_token(sentence, token);
        assert_eq!(segments.len(), expected);
    }
}


#[test]
fn test_recursive_wrap_unstable() {
    // A token at the very start of the sentence must still be highlighted.
    let terms = gen(vec!["head"]);
    let wrapped = recursive_wrap("head is match", &terms);
    assert!(wrapped.contains("fireSeqSearchHighlight"));
}

/// Converts a list of string slices into owned `String`s, preserving order.
fn gen(s: Vec<&str>) -> Vec<String> {
    s.into_iter().map(String::from).collect()
}
4 changes: 2 additions & 2 deletions pack_firefox_extension.sh
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
cd fireSeqSearch_addon
zip -r -FS ../fireSeqSearch.zip * --exclude '*.git*' --exclude "manifest_chrome.json"
zip -r -FS ../fireSeqSearch.zip * --exclude '*.git*' --exclude "monkeyscript.user.js"
cd ..
cp -f fireSeqSearch.zip /dev/shm
cp -f fireSeqSearch.zip /dev/shm

0 comments on commit 172acfb

Please sign in to comment.