Skip to content

Commit

Permalink
Fix highlight error: use proper tokenizer (#152)
Browse files (browse the repository at this point in the history)
  • Loading branch information
Endle authored Oct 21, 2024
1 parent 3b55444 commit 9967d1b
Show file tree
Hide file tree
Showing 6 changed files with 15 additions and 15 deletions.
3 changes: 2 additions & 1 deletion fire_seq_search_server/debug_server.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@ rm -f ./fire_seq_search_server
cargo build --features llm
cp target/debug/fire_seq_search_server ./fire_seq_search_server

export RUST_LOG="warn,fire_seq_search_server=info"
export RUST_LOG="warn,fire_seq_search_server=debug"
#export RUST_LOG="debug"
export RUST_BACKTRACE=1
#RAYON_NUM_THREADS=1
./fire_seq_search_server --notebook_path ~/logseq --enable-journal-query
1 change: 1 addition & 0 deletions fire_seq_search_server/deny.toml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ ignore = [
# [possible values: any SPDX 3.11 short identifier (+ optional exception)].
allow = [
"MIT", "Apache-2.0",
"Zlib",
"BSD-2-Clause", "BSD-3-Clause",
"CC0-1.0",
"MPL-2.0",
Expand Down
13 changes: 6 additions & 7 deletions fire_seq_search_server/src/post_query/highlighter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -180,8 +180,9 @@ impl RenderBlock {
// pub for test
pub fn split_leaf_node_by_terms(&self, terms: &[&str], server_info: &ServerInformation) ->Vec<RenderBlock>{
if terms.is_empty() { return Vec::new(); }
debug!("Highlighting token: {:?}", terms);
debug!("Highlighting token: {:?}, num={}", terms, terms.len());
let r = self.split_leaf_node_by_single_term(terms[0], server_info);
debug!("Split into children {:?}", &r);
if r.is_empty() { return self.split_leaf_node_by_terms(&terms[1..], server_info); }
let mut result = Vec::new();
debug!("We have {} blocks: {:?}", r.len(), &r);
Expand All @@ -200,10 +201,10 @@ impl RenderBlock {
self.check();
if self.is_hit { return ; }
if self.children.is_empty() {
let child = self.split_leaf_node_by_terms(terms, server_info);
debug!("Children list: {:?}", &child);
if !child.is_empty() {
self.children = child;
let children_vec: Vec<RenderBlock> = self.split_leaf_node_by_terms(terms, server_info);
debug!("Children number after split: {}", children_vec.len());
if !children_vec.is_empty() {
self.children = children_vec;
self.text = String::default();
}
}
Expand Down Expand Up @@ -238,9 +239,7 @@ pub fn highlight_keywords_in_body(body: &str, term_tokens: &Vec<String>,
let mut tree_root: RenderBlock = build_tree(body, server_info);
tree_root.parse_highlight(&terms_selected, server_info);
tree_root.flattern();

tree_root.render_to_string()

}

pub fn highlight_sentence_with_keywords(sentence: &str,
Expand Down
2 changes: 1 addition & 1 deletion fire_seq_search_server/src/post_query/hit_parsed.rs
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ impl FireSeqSearchHitParsed {

let logseq_uri = generate_uri_v2(&title, server_info);

debug!("Processing a hit, title={}, uri={}", &title, &logseq_uri);
debug!("Processing a hit, title={}, uri={}, summary_len={}", &title, &logseq_uri,summary.len());

let metadata: String = if is_page_hit {
String::from("page_hit")
Expand Down
3 changes: 1 addition & 2 deletions fire_seq_search_server/src/post_query/logseq_uri.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use log::{error,info};
use log::error;
use crate::ServerInformation;
use url::Url;

Expand Down Expand Up @@ -154,7 +154,6 @@ fn parse_slice_to_u8(slice: Option<&str>) -> Option<u32> {

pub fn parse_date_from_str(title: &str) -> Option<JournalDate> {
if title.len() != 10 {
info!("Journal length unexpected: {}", title);
return None;
}

Expand Down
8 changes: 4 additions & 4 deletions fire_seq_search_server/src/post_query/mod.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
use log::info;
use crate::query_engine::ServerInformation;
use crate::tokenize_default;
use crate::language_tools::tokenizer::tokenize;

pub mod logseq_uri;
pub mod highlighter;
Expand All @@ -15,8 +15,8 @@ pub fn post_query_wrapper(top_docs: Vec<(f32, tantivy::DocAddress)>,
term: &str,
searcher: &tantivy::Searcher,
server_info: &ServerInformation) -> Vec<String> {
let term_tokens = tokenize_default(term);
info!("get term tokens {:?}", &term_tokens);
let term_tokens = tokenize(term);
info!("get term tokens({}) {:?}", term_tokens.len(), &term_tokens);
let result: Vec<String> = top_docs.par_iter()
.map(|x| parse_and_serde(x, searcher, &term_tokens, server_info))
.collect();
Expand All @@ -33,6 +33,6 @@ fn parse_and_serde(x: &(f32, tantivy::DocAddress),
let hit_parsed = FireSeqSearchHitParsed::from_tantivy(
&doc, score, term_tokens, server_info
); // it also provides the highlight

hit_parsed.serde_to_string()
}

0 comments on commit 9967d1b

Please sign in to comment.