@@ -13,12 +13,17 @@ pub struct TextAnalyzer {
 impl Tokenizer for Box<dyn BoxableTokenizer> {
     type TokenStream<'a> = BoxTokenStream<'a>;
 
+    // Note: we want to call `box_token_stream` on the concrete `Tokenizer`
+    // implementation, not the `BoxableTokenizer` one, as that would cause
+    // a recursive call (and a stack overflow).
     fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
         (**self).box_token_stream(text)
     }
 }
 
 impl Clone for Box<dyn BoxableTokenizer> {
+    // Note: we want to call `box_clone` on the concrete `Tokenizer`
+    // implementation in order to clone the concrete `Tokenizer`.
     fn clone(&self) -> Self {
         (**self).box_clone()
     }
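For context, the `BoxableTokenizer` trait these two impls target sits outside this hunk. A minimal sketch of what it presumably looks like, inferred from the calls above, assuming `Tokenizer` already requires `Clone + Send + Sync + 'static` and that `BoxTokenStream` has a `new` constructor:

```rust
// Sketch only: names inferred from usage in the diff above.
pub trait BoxableTokenizer: 'static + Send + Sync {
    /// Boxes the token stream produced by the underlying `Tokenizer`.
    fn box_token_stream<'a>(&'a mut self, text: &'a str) -> BoxTokenStream<'a>;
    /// Boxes a clone of the underlying `Tokenizer`.
    fn box_clone(&self) -> Box<dyn BoxableTokenizer>;
}

// Blanket impl: any concrete `Tokenizer` can be boxed. Calling these
// methods dispatches to the concrete type, which is why the impls above
// deref twice before calling (to avoid recursing into themselves).
impl<T: Tokenizer> BoxableTokenizer for T {
    fn box_token_stream<'a>(&'a mut self, text: &'a str) -> BoxTokenStream<'a> {
        // Assumes a `BoxTokenStream::new` constructor over any `TokenStream`.
        BoxTokenStream::new(self.token_stream(text))
    }
    fn box_clone(&self) -> Box<dyn BoxableTokenizer> {
        Box::new(self.clone())
    }
}
```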
@@ -61,12 +66,12 @@ impl TextAnalyzer {
 
     /// Creates a token stream for a given `str`.
     pub fn token_stream<'a>(&'a mut self, text: &'a str) -> BoxTokenStream<'a> {
-        self.tokenizer.box_token_stream(text)
+        self.tokenizer.token_stream(text)
     }
 }
 
 /// Builder helper for [`TextAnalyzer`]
-pub struct TextAnalyzerBuilder<T = Box<dyn BoxableTokenizer>> {
+pub struct TextAnalyzerBuilder<T = Box<dyn BoxableTokenizer>> {
     tokenizer: T,
 }
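A note on the `T = Box<dyn BoxableTokenizer>` default: it lets a fully dynamic pipeline be spelled as plain `TextAnalyzerBuilder`, so runtime-built pipelines all share one nameable type. A hypothetical helper showing what that enables:

```rust
// Hypothetical: both branches return the same nameable type,
// `TextAnalyzerBuilder`, i.e. `TextAnalyzerBuilder<Box<dyn BoxableTokenizer>>`.
fn make_builder(lowercase: bool) -> TextAnalyzerBuilder {
    let builder = TextAnalyzer::builder(SimpleTokenizer::default()).dynamic();
    if lowercase {
        builder.filter_dynamic(LowerCaser)
    } else {
        builder
    }
}
```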
@@ -90,18 +95,20 @@ impl<T: Tokenizer> TextAnalyzerBuilder<T> {
         }
     }
 
-    /// Boxes the internal tokenizer. This is useful to write generic code.
-    /// When creating a `TextAnalyzer` from a `Tokenizer` and a static set of `TokenFilter`,
-    /// prefer using `TextAnalyzer::builder(tokenizer).filter(token_filter).build()` as it
-    /// will be more performant and create less boxes.
+    /// Boxes the internal tokenizer. This is useful for adding dynamic filters.
+    /// Note: this will be less performant than the non-boxed version.
     pub fn dynamic(self) -> TextAnalyzerBuilder {
         let boxed_tokenizer = Box::new(self.tokenizer);
         TextAnalyzerBuilder {
-            tokenizer: boxed_tokenizer,
+            tokenizer: boxed_tokenizer,
         }
     }
 
-    /// Apply a filter and returns a boxed version of the TextAnalyzerBuilder.
+    /// Appends a token filter to the current builder and returns a boxed version of the
+    /// tokenizer. This is useful when you want to build a `TextAnalyzer` dynamically.
+    /// Prefer using `TextAnalyzer::builder(tokenizer).filter(token_filter).build()` if
+    /// possible, as it will be more performant and create fewer boxes.
     pub fn filter_dynamic<F: TokenFilter>(self, token_filter: F) -> TextAnalyzerBuilder {
         self.filter(token_filter).dynamic()
     }
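To make the doc comments' performance note concrete, here is a hedged sketch of the two ways to assemble the same pipeline with the builder API from this diff; the static path keeps the filter stack as a concrete type, while each `_dynamic` call adds a box:

```rust
// Sketch only; assumes the builder API shown in this diff.
fn build_both() {
    // Static path: filter types fixed at compile time, no intermediate boxes.
    let _static_analyzer = TextAnalyzer::builder(SimpleTokenizer::default())
        .filter(RemoveLongFilter::limit(40))
        .filter(LowerCaser)
        .build();

    // Dynamic path: each `filter_dynamic` boxes the tokenizer, so further
    // filters can be chosen at runtime (see the test below).
    let _dynamic_analyzer = TextAnalyzer::builder(SimpleTokenizer::default())
        .filter_dynamic(RemoveLongFilter::limit(40))
        .filter_dynamic(LowerCaser)
        .build();
}
```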
@@ -114,12 +121,11 @@ impl<T: Tokenizer> TextAnalyzerBuilder<T> {
     }
 }
 
-
 #[cfg(test)]
 mod tests {
 
     use super::*;
-    use crate::tokenizer::{AlphaNumOnlyFilter, LowerCaser, RemoveLongFilter, WhitespaceTokenizer, SimpleTokenizer};
+    use crate::tokenizer::{LowerCaser, RemoveLongFilter, SimpleTokenizer};
 
     #[test]
     fn test_text_analyzer_builder() {
@@ -133,8 +139,6 @@ mod tests {
         assert_eq!(stream.next().unwrap().text, "bullet");
     }
 
-
-
     #[test]
     fn test_text_analyzer_with_filters_boxed() {
         // This test shows how one can build a TextAnalyzer dynamically, by stacking a list
@@ -151,19 +155,20 @@ mod tests {
             SerializableTokenFilterEnum::LowerCaser(LowerCaser),
             SerializableTokenFilterEnum::RemoveLongFilter(RemoveLongFilter::limit(12)),
         ];
-        let mut analyzer_builder: TextAnalyzerBuilder = TextAnalyzer::builder(SimpleTokenizer::default())
-            .filter_dynamic(RemoveLongFilter::limit(40))
-            .filter_dynamic(LowerCaser);
-        // for filter in filters {
-        //     analyzer_builder =
-        //         match filter {
-        //             SerializableTokenFilterEnum::LowerCaser(lower_caser) =>
-        //                 analyzer_builder.filter_dynamic(lower_caser),
-        //             SerializableTokenFilterEnum::RemoveLongFilter(remove_long_filter) => {
-        //                 analyzer_builder.filter_dynamic(remove_long_filter)
-        //             },
-        //         }
-        // }
+        let mut analyzer_builder: TextAnalyzerBuilder =
+            TextAnalyzer::builder(SimpleTokenizer::default())
+                .filter_dynamic(RemoveLongFilter::limit(40))
+                .filter_dynamic(LowerCaser);
+        for filter in filters {
+            analyzer_builder = match filter {
+                SerializableTokenFilterEnum::LowerCaser(lower_caser) => {
+                    analyzer_builder.filter_dynamic(lower_caser)
+                }
+                SerializableTokenFilterEnum::RemoveLongFilter(remove_long_filter) => {
+                    analyzer_builder.filter_dynamic(remove_long_filter)
+                }
+            }
+        }
         let mut analyzer = analyzer_builder.build().clone();
         let mut stream = analyzer.token_stream("first bullet point");
         assert_eq!(stream.next().unwrap().text, "first");
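The `SerializableTokenFilterEnum` this test relies on is defined outside the hunk; from its usage it is presumably a plain enum over the concrete filter types, along these lines:

```rust
// Inferred from the test above: each variant wraps a concrete filter so a
// deserialized list of filters can be applied at runtime via `filter_dynamic`.
enum SerializableTokenFilterEnum {
    LowerCaser(LowerCaser),
    RemoveLongFilter(RemoveLongFilter),
}
```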