
Commit 0a23201

fmassot authored and fulmicoton committed
Fix stackoverflow and add docs.
1 parent 81330aa commit 0a23201

3 files changed: +35 −45 lines changed


benches/analyzer.rs

Lines changed: 3 additions & 15 deletions
@@ -1,5 +1,7 @@
 use criterion::{criterion_group, criterion_main, Criterion};
-use tantivy::tokenizer::{TokenizerManager, TextAnalyzer, RemoveLongFilter, LowerCaser, SimpleTokenizer};
+use tantivy::tokenizer::{
+    LowerCaser, RemoveLongFilter, SimpleTokenizer, TextAnalyzer, TokenizerManager,
+};
 
 const ALICE_TXT: &str = include_str!("alice.txt");
 
@@ -16,20 +18,6 @@ pub fn criterion_benchmark(c: &mut Criterion) {
             assert_eq!(word_count, 30_731);
         })
     });
-    let mut static_analyzer = TextAnalyzer::builder(SimpleTokenizer::default())
-        .filter(RemoveLongFilter::limit(40))
-        .filter(LowerCaser)
-        .build();
-    c.bench_function("static-tokenize-alice", |b| {
-        b.iter(|| {
-            let mut word_count = 0;
-            let mut token_stream = static_analyzer.token_stream(ALICE_TXT);
-            while token_stream.advance() {
-                word_count += 1;
-            }
-            assert_eq!(word_count, 30_731);
-        })
-    });
     let mut dynamic_analyzer = TextAnalyzer::builder(SimpleTokenizer::default())
         .dynamic()
         .filter_dynamic(RemoveLongFilter::limit(40))

src/query/more_like_this/more_like_this.rs

Lines changed: 2 additions & 5 deletions
@@ -4,9 +4,7 @@ use std::collections::{BinaryHeap, HashMap};
 use crate::query::bm25::idf;
 use crate::query::{BooleanQuery, BoostQuery, Occur, Query, TermQuery};
 use crate::schema::{Field, FieldType, IndexRecordOption, Term, Value};
-use crate::tokenizer::{
-    FacetTokenizer, PreTokenizedStream, TokenStream, Tokenizer,
-};
+use crate::tokenizer::{FacetTokenizer, PreTokenizedStream, TokenStream, Tokenizer};
 use crate::{DocAddress, Result, Searcher, TantivyError};
 
 #[derive(Debug, PartialEq)]
@@ -206,8 +204,7 @@ impl MoreLikeThis {
                 for value in values {
                     match value {
                         Value::PreTokStr(tok_str) => {
-                            let mut token_stream =
-                                PreTokenizedStream::from(tok_str.clone());
+                            let mut token_stream = PreTokenizedStream::from(tok_str.clone());
                             token_stream.process(&mut |token| {
                                 if !self.is_noise_word(token.text.clone()) {
                                     let term = Term::from_field_text(field, &token.text);

src/tokenizer/tokenizer.rs

Lines changed: 30 additions & 25 deletions
@@ -13,12 +13,17 @@ pub struct TextAnalyzer {
 impl Tokenizer for Box<dyn BoxableTokenizer> {
     type TokenStream<'a> = BoxTokenStream<'a>;
 
+    // Note: we want to call `box_token_stream` on the concrete `Tokenizer`
+    // implementation, not the `BoxableTokenizer` one as it will cause
+    // a recursive call (and a stack overflow).
     fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
         (**self).box_token_stream(text)
     }
 }
 
 impl Clone for Box<dyn BoxableTokenizer> {
+    // Note: we want to call `box_clone` on the concrete `Tokenizer`
+    // implementation in order to clone the concrete `Tokenizer`.
     fn clone(&self) -> Self {
         (**self).box_clone()
     }
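The note in this hunk is the heart of the stack-overflow fix: `Box<dyn BoxableTokenizer>` itself implements `Tokenizer`, so it also picks up the blanket `BoxableTokenizer` implementation, and a delegating method that forgets to deref the box calls straight back into itself. Below is a minimal, self-contained sketch of that shape using made-up names (`Speak`, `BoxableSpeak`, `Dog`), not tantivy's actual definitions:

// A simplified model of the pitfall (illustrative names only): a blanket impl
// gives every `Speak` implementor, including `Box<dyn BoxableSpeak>` itself,
// a `box_speak` method, so the impl for the box must deref before delegating.
trait Speak {
    fn speak(&self) -> String;
}

trait BoxableSpeak: Speak {
    fn box_speak(&self) -> String;
}

// Blanket impl: every sized `Speak` type gets `box_speak`, including the box below.
impl<T: Speak> BoxableSpeak for T {
    fn box_speak(&self) -> String {
        self.speak()
    }
}

impl Speak for Box<dyn BoxableSpeak> {
    fn speak(&self) -> String {
        // Correct: `(**self)` reaches the concrete implementor behind the box
        // through the vtable. Writing `self.box_speak()` instead would select
        // the blanket impl for `Box<dyn BoxableSpeak>` itself, which calls back
        // into this very method: infinite recursion, hence the stack overflow.
        (**self).box_speak()
    }
}

struct Dog;
impl Speak for Dog {
    fn speak(&self) -> String {
        "woof".to_string()
    }
}

fn main() {
    let boxed: Box<dyn BoxableSpeak> = Box::new(Dog);
    assert_eq!(boxed.speak(), "woof");
}

The `(**self)` dereference is what breaks the cycle: it dispatches through the trait object's vtable to the concrete implementor instead of re-entering the impl written for the box.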
@@ -61,12 +66,12 @@ impl TextAnalyzer {
 
     /// Creates a token stream for a given `str`.
     pub fn token_stream<'a>(&'a mut self, text: &'a str) -> BoxTokenStream<'a> {
-        self.tokenizer.box_token_stream(text)
+        self.tokenizer.token_stream(text)
     }
 }
 
 /// Builder helper for [`TextAnalyzer`]
-pub struct TextAnalyzerBuilder<T=Box<dyn BoxableTokenizer>> {
+pub struct TextAnalyzerBuilder<T = Box<dyn BoxableTokenizer>> {
     tokenizer: T,
 }
 
@@ -90,18 +95,20 @@ impl<T: Tokenizer> TextAnalyzerBuilder<T> {
         }
     }
 
-    /// Boxes the internal tokenizer. This is useful to write generic code.
-    /// When creating a `TextAnalyzer` from a `Tokenizer` and a static set of `TokenFilter`,
-    /// prefer using `TextAnalyzer::builder(tokenizer).filter(token_filter).build()` as it
-    /// will be more performant and create less boxes.
+    /// Boxes the internal tokenizer. This is useful for adding dynamic filters.
+    /// Note: this will be less performant than the non boxed version.
     pub fn dynamic(self) -> TextAnalyzerBuilder {
         let boxed_tokenizer = Box::new(self.tokenizer);
         TextAnalyzerBuilder {
-                tokenizer: boxed_tokenizer,
+            tokenizer: boxed_tokenizer,
         }
     }
 
-    /// Apply a filter and returns a boxed version of the TextAnalyzerBuilder.
+    /// Appends a token filter to the current builder and returns a boxed version of the
+    /// tokenizer. This is useful when you want to build a `TextAnalyzer` dynamically.
+    /// Prefer using `TextAnalyzer::builder(tokenizer).filter(token_filter).build()` if
+    /// possible as it will be more performant and create less boxes.
+    /// ```
     pub fn filter_dynamic<F: TokenFilter>(self, token_filter: F) -> TextAnalyzerBuilder {
        self.filter(token_filter).dynamic()
    }
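Read together, these two doc comments describe the intended split between the static and the dynamic builder. Here is a short usage sketch; it is not part of the commit, it only combines calls that appear elsewhere in this diff and assumes the usual `tantivy::tokenizer` exports:

use tantivy::tokenizer::{LowerCaser, RemoveLongFilter, SimpleTokenizer, TextAnalyzer, TokenStream};

fn main() {
    // Static pipeline: the filter types are fixed at compile time, no boxing.
    let mut static_analyzer = TextAnalyzer::builder(SimpleTokenizer::default())
        .filter(RemoveLongFilter::limit(40))
        .filter(LowerCaser)
        .build();
    let mut stream = static_analyzer.token_stream("First Bullet Point");
    while let Some(token) = stream.next() {
        println!("static: {}", token.text); // "first", "bullet", "point"
    }

    // Dynamic pipeline: each `filter_dynamic` boxes the pipeline so the filter
    // chain can be chosen at runtime, at the cost of some indirection.
    let mut dynamic_analyzer = TextAnalyzer::builder(SimpleTokenizer::default())
        .filter_dynamic(RemoveLongFilter::limit(40))
        .filter_dynamic(LowerCaser)
        .build();
    let mut stream = dynamic_analyzer.token_stream("First Bullet Point");
    while let Some(token) = stream.next() {
        println!("dynamic: {}", token.text);
    }
}

Both pipelines produce the same tokens; the static one is the recommended default, and the dynamic one exists for the cases the test below exercises, where the set of filters is only known at runtime.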
@@ -114,12 +121,11 @@ impl<T: Tokenizer> TextAnalyzerBuilder<T> {
     }
 }
 
-
 #[cfg(test)]
 mod tests {
 
     use super::*;
-    use crate::tokenizer::{AlphaNumOnlyFilter, LowerCaser, RemoveLongFilter, WhitespaceTokenizer, SimpleTokenizer};
+    use crate::tokenizer::{LowerCaser, RemoveLongFilter, SimpleTokenizer};
 
     #[test]
     fn test_text_analyzer_builder() {
@@ -133,8 +139,6 @@ mod tests {
         assert_eq!(stream.next().unwrap().text, "bullet");
     }
 
-
-
     #[test]
     fn test_text_analyzer_with_filters_boxed() {
         // This test shows how one can build a TextAnalyzer dynamically, by stacking a list
@@ -151,19 +155,20 @@
             SerializableTokenFilterEnum::LowerCaser(LowerCaser),
             SerializableTokenFilterEnum::RemoveLongFilter(RemoveLongFilter::limit(12)),
         ];
-        let mut analyzer_builder: TextAnalyzerBuilder = TextAnalyzer::builder(SimpleTokenizer::default())
-            .filter_dynamic(RemoveLongFilter::limit(40))
-            .filter_dynamic(LowerCaser);
-        // for filter in filters {
-        //     analyzer_builder =
-        //         match filter {
-        //             SerializableTokenFilterEnum::LowerCaser(lower_caser) =>
-        //                 analyzer_builder.filter_dynamic(lower_caser),
-        //             SerializableTokenFilterEnum::RemoveLongFilter(remove_long_filter) => {
-        //                 analyzer_builder.filter_dynamic(remove_long_filter)
-        //             },
-        //         }
-        // }
+        let mut analyzer_builder: TextAnalyzerBuilder =
+            TextAnalyzer::builder(SimpleTokenizer::default())
+                .filter_dynamic(RemoveLongFilter::limit(40))
+                .filter_dynamic(LowerCaser);
+        for filter in filters {
+            analyzer_builder = match filter {
+                SerializableTokenFilterEnum::LowerCaser(lower_caser) => {
+                    analyzer_builder.filter_dynamic(lower_caser)
+                }
+                SerializableTokenFilterEnum::RemoveLongFilter(remove_long_filter) => {
+                    analyzer_builder.filter_dynamic(remove_long_filter)
+                }
+            }
+        }
         let mut analyzer = analyzer_builder.build().clone();
         let mut stream = analyzer.token_stream("first bullet point");
         assert_eq!(stream.next().unwrap().text, "first");
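One detail worth spelling out about the rewritten test: `filter_dynamic` is generic over `F: TokenFilter`, so heterogeneous filters cannot be stored in a single `Vec` directly; the test wraps them in an enum and matches on each variant to recover a concrete type before calling `filter_dynamic`. The enum's definition sits above the hunk shown here and does not appear in the diff; judging from the variants used, it is presumably declared roughly like this (hypothetical reconstruction):

// Hypothetical reconstruction; the real definition lives earlier in the test
// module and is not part of this diff.
enum SerializableTokenFilterEnum {
    LowerCaser(LowerCaser),
    RemoveLongFilter(RemoveLongFilter),
}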
