From e42179ece689fd30039458c91a2803d56e87ca6a Mon Sep 17 00:00:00 2001
From: edu
Date: Mon, 24 Apr 2017 16:07:13 +0200
Subject: [PATCH 01/40] Added tokenizers

---
 .../schema/analysis/AnalyzerBuilder.java      |   3 +-
 .../schema/analysis/CustomAnalyzer.java       |  30 ++
 .../analysis/CustomAnalyzerBuilder.java       |  52 ++++
 .../tokenizer/ClassicTokenizerBuilder.java    |  52 ++++
 .../tokenizer/KeywordTokenizerBuilder.java    |  50 ++++
 .../tokenizer/LetterTokenizerBuilder.java     |  32 +++
 .../tokenizer/NGramTokenizerBuilder.java      |  58 ++++
 .../PathHierarchyTokenizerBuilder.java        |  74 +++++
 .../tokenizer/PatternTokenizerBuilder.java    |  68 +++++
 .../ReversePathHierarchyTokenizerBuilder.java |  74 +++++
 .../tokenizer/StandardTokenizerBuilder.java   |  52 ++++
 .../analysis/tokenizer/TokenizerBuilder.java  |  60 ++++
 .../UAX29URLEmailTokenizerBuilder.java        |  53 ++++
 .../UnicodeWhitespaceTokenizerBuilder.java    |  32 +++
 .../tokenizer/WhitespaceTokenizerBuilder.java |  32 +++
 .../tokenizer/WikipediaTokenizerBuilder.java  |  96 +++++++
 .../tokenizer/TokenizerBuilderTest.java       | 262 ++++++++++++++++++
 17 files changed, 1079 insertions(+), 1 deletion(-)
 create mode 100644 plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/CustomAnalyzer.java
 create mode 100644 plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/CustomAnalyzerBuilder.java
 create mode 100644 plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/ClassicTokenizerBuilder.java
 create mode 100644 plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/KeywordTokenizerBuilder.java
 create mode 100644 plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/LetterTokenizerBuilder.java
 create mode 100644 plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/NGramTokenizerBuilder.java
 create mode 100644 plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/PathHierarchyTokenizerBuilder.java
 create mode 100644 plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/PatternTokenizerBuilder.java
 create mode 100644 plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/ReversePathHierarchyTokenizerBuilder.java
 create mode 100644 plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/StandardTokenizerBuilder.java
 create mode 100644 plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/TokenizerBuilder.java
 create mode 100644 plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/UAX29URLEmailTokenizerBuilder.java
 create mode 100644 plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/UnicodeWhitespaceTokenizerBuilder.java
 create mode 100644 plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/WhitespaceTokenizerBuilder.java
 create mode 100644 plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/WikipediaTokenizerBuilder.java
 create mode 100644 plugin/src/test/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/TokenizerBuilderTest.java

diff --git a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/AnalyzerBuilder.java b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/AnalyzerBuilder.java
index 87e89c582..ed8ff267a 100644
--- a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/AnalyzerBuilder.java
+++ b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/AnalyzerBuilder.java
@@ -26,7 +26,8 @@
  */
 @JsonTypeInfo(use = JsonTypeInfo.Id.NAME, include = JsonTypeInfo.As.PROPERTY, property = "type")
 @JsonSubTypes({@JsonSubTypes.Type(value = ClasspathAnalyzerBuilder.class, name = "classpath"),
-               @JsonSubTypes.Type(value = SnowballAnalyzerBuilder.class, name = "snowball")})
+               @JsonSubTypes.Type(value = SnowballAnalyzerBuilder.class, name = "snowball"),
+               @JsonSubTypes.Type(value = CustomAnalyzerBuilder.class, name = "custom")})
 public abstract class AnalyzerBuilder {
 
     /**
diff --git a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/CustomAnalyzer.java b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/CustomAnalyzer.java
new file mode 100644
index 000000000..647b0f213
--- /dev/null
+++ b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/CustomAnalyzer.java
@@ -0,0 +1,30 @@
+package com.stratio.cassandra.lucene.schema.analysis;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+
+/**
+ * @author Eduardo Alonso {@literal }
+ */
+public class CustomAnalyzer extends Analyzer {
+
+    final Tokenizer tokenizer;
+
+
+
+    public CustomAnalyzer(Tokenizer tokenizer) {
+        this.tokenizer = tokenizer;
+
+    }
+
+    @Override
+    protected TokenStreamComponents createComponents(String fieldName) {
+
+        TokenStream ts = tokenizer;
+
+        return new TokenStreamComponents(tokenizer, ts);
+    }
+
+
+}
diff --git a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/CustomAnalyzerBuilder.java b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/CustomAnalyzerBuilder.java
new file mode 100644
index 000000000..9bee8b3ae
--- /dev/null
+++ b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/CustomAnalyzerBuilder.java
@@ -0,0 +1,52 @@
+/*
+ * Copyright (C) 2014 Stratio (http://stratio.com)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.stratio.cassandra.lucene.schema.analysis;
+
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.stratio.cassandra.lucene.schema.analysis.tokenizer.TokenizerBuilder;
+import org.apache.lucene.analysis.Analyzer;
+
+/**
+ * {@link AnalyzerBuilder} for building {@link Analyzer}s based on an advanced configuration.
+ *
+ * @author Eduardo Alonso {@literal }
+ */
+public class CustomAnalyzerBuilder extends AnalyzerBuilder {
+
+    @JsonProperty("tokenizer")
+    private final TokenizerBuilder tokenizer;
+
+    /**
+     * Builds a new {@link CustomAnalyzerBuilder} using the specified tokenizer.
+     *
+     * @param tokenizer the {@link TokenizerBuilder} for the tokenizer to use
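+     *
+     * <p>For example, a JSON value such as
+     * {@code {type: "custom", tokenizer: {type: "classic", max_token_length: 250}}}
+     * (an illustrative sketch; the tokenizer options are those accepted by the matching
+     * {@link TokenizerBuilder} subtype) is deserialized into this builder.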
+     */
+    @JsonCreator
+    public CustomAnalyzerBuilder(@JsonProperty("tokenizer") TokenizerBuilder tokenizer) {
+        this.tokenizer = tokenizer;
+    }
+
+    /** {@inheritDoc} */
+    @Override
+    public Analyzer analyzer() {
+        return new CustomAnalyzer(tokenizer.buildTokenizer());
+    }
+}
+
+
diff --git a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/ClassicTokenizerBuilder.java b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/ClassicTokenizerBuilder.java
new file mode 100644
index 000000000..ebf5855c4
--- /dev/null
+++ b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/ClassicTokenizerBuilder.java
@@ -0,0 +1,52 @@
+/*
+ * Copyright (C) 2014 Stratio (http://stratio.com)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.stratio.cassandra.lucene.schema.analysis.tokenizer;
+
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import org.apache.lucene.analysis.standard.ClassicTokenizer;
+
+/**
+ * A {@link TokenizerBuilder} for building {@link org.apache.lucene.analysis.standard.ClassicTokenizer}
+ *
+ * @author Eduardo Alonso {@literal }
+ */
+public class ClassicTokenizerBuilder extends TokenizerBuilder {
+
+    static final Integer DEFAULT_MAX_TOKEN_LENGTH = 255;
+
+    /** If a token is longer than this, it is split at max_token_length intervals. */
+    @JsonProperty("max_token_length")
+    final Integer maxTokenLength;
+
+    /**
+     * Builds a new {@link ClassicTokenizerBuilder} using the specified maxTokenLength.
+     *
+     * @param maxTokenLength if a token is longer than this, it is split at max_token_length intervals
+     */
+    @JsonCreator
+    public ClassicTokenizerBuilder(@JsonProperty("max_token_length") Integer maxTokenLength) {
+        this.maxTokenLength = getOrDefault(maxTokenLength, DEFAULT_MAX_TOKEN_LENGTH);
+    }
+
+    /** {@inheritDoc} */
+    @Override
+    public ClassicTokenizer buildTokenizer() {
+        ClassicTokenizer tokenizer = new ClassicTokenizer();
+        tokenizer.setMaxTokenLength(maxTokenLength);
+        return tokenizer;
+    }
+}
diff --git a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/KeywordTokenizerBuilder.java b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/KeywordTokenizerBuilder.java
new file mode 100644
index 000000000..687ba9c2b
--- /dev/null
+++ b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/KeywordTokenizerBuilder.java
@@ -0,0 +1,50 @@
+/*
+ * Copyright (C) 2014 Stratio (http://stratio.com)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.stratio.cassandra.lucene.schema.analysis.tokenizer;
+
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
+
+/**
+ * A {@link TokenizerBuilder} for building {@link org.apache.lucene.analysis.core.KeywordTokenizer}
+ *
+ * @author Eduardo Alonso {@literal }
+ */
+public class KeywordTokenizerBuilder extends TokenizerBuilder {
+
+    static final Integer DEFAULT_BUFFER_SIZE = 256;
+
+    /** terms cache read buffer size */
+    @JsonProperty("buffer_size")
+    final Integer bufferSize;
+
+    /**
+     * Builds a new {@link KeywordTokenizerBuilder} using the specified buffer_size.
+     *
+     * @param bufferSize the terms cache read buffer size
+     */
+    @JsonCreator
+    public KeywordTokenizerBuilder(@JsonProperty("buffer_size") Integer bufferSize) {
+        this.bufferSize = getOrDefault(bufferSize, DEFAULT_BUFFER_SIZE);
+    }
+
+    /** {@inheritDoc} */
+    @Override
+    public KeywordTokenizer buildTokenizer() {
+        return new KeywordTokenizer(bufferSize);
+    }
+}
diff --git a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/LetterTokenizerBuilder.java b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/LetterTokenizerBuilder.java
new file mode 100644
index 000000000..e16f85313
--- /dev/null
+++ b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/LetterTokenizerBuilder.java
@@ -0,0 +1,32 @@
+/*
+ * Copyright (C) 2014 Stratio (http://stratio.com)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.stratio.cassandra.lucene.schema.analysis.tokenizer;
+
+import org.apache.lucene.analysis.core.LetterTokenizer;
+
+/**
+ * A {@link TokenizerBuilder} for building {@link org.apache.lucene.analysis.core.LetterTokenizer}
+ *
+ * @author Eduardo Alonso {@literal }
+ */
+public class LetterTokenizerBuilder extends TokenizerBuilder {
+
+    /** {@inheritDoc} */
+    @Override
+    public LetterTokenizer buildTokenizer() {
+        return new LetterTokenizer();
+    }
+}
diff --git a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/NGramTokenizerBuilder.java b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/NGramTokenizerBuilder.java
new file mode 100644
index 000000000..4da9dd300
--- /dev/null
+++ b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/NGramTokenizerBuilder.java
@@ -0,0 +1,58 @@
+/*
+ * Copyright (C) 2014 Stratio (http://stratio.com)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.stratio.cassandra.lucene.schema.analysis.tokenizer;
+
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import org.apache.lucene.analysis.ngram.NGramTokenizer;
+
+/**
+ * A {@link TokenizerBuilder} for building {@link org.apache.lucene.analysis.ngram.NGramTokenizer}
+ *
+ * @author Eduardo Alonso {@literal }
+ */
+public class NGramTokenizerBuilder extends TokenizerBuilder {
+
+    static final Integer DEFAULT_MIN_GRAM = 1;
+    static final Integer DEFAULT_MAX_GRAM = 2;
+
+    /** the smallest n-gram to generate */
+    @JsonProperty("min_gram")
+    final Integer minGram;
+
+    /** the largest n-gram to generate */
+    @JsonProperty("max_gram")
+    final Integer maxGram;
+
+    /**
+     * Builds a new {@link NGramTokenizerBuilder} using the specified minGram and maxGram.
+     *
+     * @param minGram the smallest n-gram to generate
+     * @param maxGram the largest n-gram to generate
+     */
+    @JsonCreator
+    public NGramTokenizerBuilder(@JsonProperty("min_gram") Integer minGram,
+                                 @JsonProperty("max_gram") Integer maxGram) {
+        this.minGram = getOrDefault(minGram, DEFAULT_MIN_GRAM);
+        this.maxGram = getOrDefault(maxGram, DEFAULT_MAX_GRAM);
+    }
+
+    /** {@inheritDoc} */
+    @Override
+    public NGramTokenizer buildTokenizer() {
+        return new NGramTokenizer(minGram, maxGram);
+    }
+}
diff --git a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/PathHierarchyTokenizerBuilder.java b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/PathHierarchyTokenizerBuilder.java
new file mode 100644
index 000000000..33b0eac58
--- /dev/null
+++ b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/PathHierarchyTokenizerBuilder.java
@@ -0,0 +1,74 @@
+/*
+ * Copyright (C) 2014 Stratio (http://stratio.com)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.stratio.cassandra.lucene.schema.analysis.tokenizer;
+
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import org.apache.lucene.analysis.path.PathHierarchyTokenizer;
+
+/**
+ * A {@link TokenizerBuilder} for building {@link org.apache.lucene.analysis.path.PathHierarchyTokenizer}
+ *
+ * @author Eduardo Alonso {@literal }
+ */
+public class PathHierarchyTokenizerBuilder extends TokenizerBuilder {
+
+    static final Integer DEFAULT_BUFFER_SIZE = 1024;
+    static final Character DEFAULT_DELIMITER = '/';
+    static final Character DEFAULT_REPLACEMENT = '/';
+    static final Integer DEFAULT_SKIP = 0;
+
+    /** terms cache read buffer size */
+    @JsonProperty("buffer_size")
+    final Integer bufferSize;
+
+    /** path separator */
+    @JsonProperty("delimiter")
+    final Character delimiter;
+
+    /** a replacement character for delimiter */
+    @JsonProperty("replacement")
+    final Character replacement;
+
+    /** number of initial tokens to skip */
+    @JsonProperty("skip")
+    final Integer skip;
+
+    /**
+     * Builds a new {@link PathHierarchyTokenizerBuilder} using the specified bufferSize, delimiter, replacement and skip.
+     *
+     * @param bufferSize terms cache read buffer size
+     * @param delimiter path separator
+     * @param replacement a replacement character for delimiter
+     * @param skip number of initial tokens to skip
+     */
+    @JsonCreator
+    public PathHierarchyTokenizerBuilder(@JsonProperty("buffer_size") Integer bufferSize,
+                                         @JsonProperty("delimiter") Character delimiter,
+                                         @JsonProperty("replacement") Character replacement,
+                                         @JsonProperty("skip") Integer skip) {
+        this.bufferSize = getOrDefault(bufferSize, DEFAULT_BUFFER_SIZE);
+        this.delimiter = getOrDefault(delimiter, DEFAULT_DELIMITER);
+        this.replacement = getOrDefault(replacement, DEFAULT_REPLACEMENT);
+        this.skip = getOrDefault(skip, DEFAULT_SKIP);
+    }
+
+    /** {@inheritDoc} */
+    @Override
+    public PathHierarchyTokenizer buildTokenizer() {
+        return new PathHierarchyTokenizer(bufferSize, delimiter, replacement, skip);
+    }
+}
diff --git a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/PatternTokenizerBuilder.java b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/PatternTokenizerBuilder.java
new file mode 100644
index 000000000..ca650a96f
--- /dev/null
+++ b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/PatternTokenizerBuilder.java
@@ -0,0 +1,68 @@
+/*
+ * Copyright (C) 2014 Stratio (http://stratio.com)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.stratio.cassandra.lucene.schema.analysis.tokenizer;
+
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import org.apache.lucene.analysis.pattern.PatternTokenizer;
+
+import java.util.regex.Pattern;
+
+/**
+ * A {@link TokenizerBuilder} for building {@link org.apache.lucene.analysis.pattern.PatternTokenizer}
+ *
+ * @author Eduardo Alonso {@literal }
+ */
+public class PatternTokenizerBuilder extends TokenizerBuilder {
+
+    static final String DEFAULT_PATTERN = "\\W+";
+    static final Integer DEFAULT_FLAGS = 0;
+    static final Integer DEFAULT_GROUP = -1;
+
+    /** java regular expression */
+    @JsonProperty("pattern")
+    final String pattern;
+
+    /** java regular expression flags */
+    @JsonProperty("flags")
+    final Integer flags;
+
+    /** which pattern group to use to generate tokens (-1 for split) */
+    @JsonProperty("group")
+    final Integer group;
+
+    /**
+     * Builds a new {@link PatternTokenizerBuilder} using the specified pattern, flags, and group.
+     *
+     * @param pattern java regular expression
+     * @param flags java regular expression flags
+     * @param group a pattern group to use to generate tokens (-1 for split)
+     */
+    @JsonCreator
+    public PatternTokenizerBuilder(@JsonProperty("pattern") String pattern,
+                                   @JsonProperty("flags") Integer flags,
+                                   @JsonProperty("group") Integer group) {
+        this.pattern = getOrDefault(pattern, DEFAULT_PATTERN);
+        this.flags = getOrDefault(flags, DEFAULT_FLAGS);
+        this.group = getOrDefault(group, DEFAULT_GROUP);
+    }
+
+    /** {@inheritDoc} */
+    @Override
+    public PatternTokenizer buildTokenizer() {
+        return new PatternTokenizer(Pattern.compile(pattern, flags), group);
+    }
+}
diff --git a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/ReversePathHierarchyTokenizerBuilder.java b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/ReversePathHierarchyTokenizerBuilder.java
new file mode 100644
index 000000000..659272e9f
--- /dev/null
+++ b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/ReversePathHierarchyTokenizerBuilder.java
@@ -0,0 +1,74 @@
+/*
+ * Copyright (C) 2014 Stratio (http://stratio.com)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.stratio.cassandra.lucene.schema.analysis.tokenizer;
+
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import org.apache.lucene.analysis.path.ReversePathHierarchyTokenizer;
+
+/**
+ * A {@link TokenizerBuilder} for building {@link org.apache.lucene.analysis.path.ReversePathHierarchyTokenizer}
+ *
+ * @author Eduardo Alonso {@literal }
+ */
+public class ReversePathHierarchyTokenizerBuilder extends TokenizerBuilder {
+
+    static final Integer DEFAULT_BUFFER_SIZE = 1024;
+    static final Character DEFAULT_DELIMITER = '/';
+    static final Character DEFAULT_REPLACEMENT = '/';
+    static final Integer DEFAULT_SKIP = 0;
+
+    /** terms cache read buffer size */
+    @JsonProperty("buffer_size")
+    final Integer bufferSize;
+
+    /** path separator */
+    @JsonProperty("delimiter")
+    final Character delimiter;
+
+    /** a replacement character for delimiter */
+    @JsonProperty("replacement")
+    final Character replacement;
+
+    /** number of initial tokens to skip */
+    @JsonProperty("skip")
+    final Integer skip;
+
+    /**
+     * Builds a new {@link ReversePathHierarchyTokenizerBuilder} using the specified bufferSize, delimiter, replacement and skip.
+     *
+     * @param bufferSize terms cache read buffer size
+     * @param delimiter path separator
+     * @param replacement a replacement character for delimiter
+     * @param skip number of initial tokens to skip
+     */
+    @JsonCreator
+    public ReversePathHierarchyTokenizerBuilder(@JsonProperty("buffer_size") Integer bufferSize,
+                                                @JsonProperty("delimiter") Character delimiter,
+                                                @JsonProperty("replacement") Character replacement,
+                                                @JsonProperty("skip") Integer skip) {
+        this.bufferSize = getOrDefault(bufferSize, DEFAULT_BUFFER_SIZE);
+        this.delimiter = getOrDefault(delimiter, DEFAULT_DELIMITER);
+        this.replacement = getOrDefault(replacement, DEFAULT_REPLACEMENT);
+        this.skip = getOrDefault(skip, DEFAULT_SKIP);
+    }
+
+    /** {@inheritDoc} */
+    @Override
+    public ReversePathHierarchyTokenizer buildTokenizer() {
+        return new ReversePathHierarchyTokenizer(bufferSize, delimiter, replacement, skip);
+    }
+}
diff --git a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/StandardTokenizerBuilder.java b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/StandardTokenizerBuilder.java
new file mode 100644
index 000000000..5faa1f480
--- /dev/null
+++ b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/StandardTokenizerBuilder.java
@@ -0,0 +1,52 @@
+/*
+ * Copyright (C) 2014 Stratio (http://stratio.com)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.stratio.cassandra.lucene.schema.analysis.tokenizer;
+
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+
+/**
+ * A {@link TokenizerBuilder} for building {@link org.apache.lucene.analysis.standard.StandardTokenizer}
+ *
+ * @author Eduardo Alonso {@literal }
+ */
+public class StandardTokenizerBuilder extends TokenizerBuilder {
+
+    static final Integer DEFAULT_MAX_TOKEN_LENGTH = 255;
+
+    /** If a token is longer than this, it is split at max_token_length intervals. */
+    @JsonProperty("max_token_length")
+    final Integer maxTokenLength;
+
+    /**
+     * Builds a new {@link StandardTokenizerBuilder} using the specified maxTokenLength.
+     *
+     * @param maxTokenLength if a token is longer than this, it is split at max_token_length intervals
+     */
+    @JsonCreator
+    public StandardTokenizerBuilder(@JsonProperty("max_token_length") Integer maxTokenLength) {
+        this.maxTokenLength = getOrDefault(maxTokenLength, DEFAULT_MAX_TOKEN_LENGTH);
+    }
+
+    /** {@inheritDoc} */
+    @Override
+    public StandardTokenizer buildTokenizer() {
+        StandardTokenizer tokenizer = new StandardTokenizer();
+        tokenizer.setMaxTokenLength(maxTokenLength);
+        return tokenizer;
+    }
+}
diff --git a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/TokenizerBuilder.java b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/TokenizerBuilder.java
new file mode 100644
index 000000000..e7146da68
--- /dev/null
+++ b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/TokenizerBuilder.java
@@ -0,0 +1,60 @@
+/*
+ * Copyright (C) 2014 Stratio (http://stratio.com)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.stratio.cassandra.lucene.schema.analysis.tokenizer;
+
+import com.fasterxml.jackson.annotation.JsonSubTypes;
+import com.fasterxml.jackson.annotation.JsonTypeInfo;
+import org.apache.lucene.analysis.Tokenizer;
+
+/**
+ * @author Eduardo Alonso {@literal }
+ */
+@JsonTypeInfo(use = JsonTypeInfo.Id.NAME, include = JsonTypeInfo.As.PROPERTY, property = "type")
+@JsonSubTypes({@JsonSubTypes.Type(value = ClassicTokenizerBuilder.class, name = "classic"),
+               @JsonSubTypes.Type(value = KeywordTokenizerBuilder.class, name = "keyword"),
+               @JsonSubTypes.Type(value = LetterTokenizerBuilder.class, name = "letter"),
+               @JsonSubTypes.Type(value = NGramTokenizerBuilder.class, name = "ngram"),
+               @JsonSubTypes.Type(value = PathHierarchyTokenizerBuilder.class, name = "path_hierarchy"),
+               @JsonSubTypes.Type(value = PatternTokenizerBuilder.class, name = "pattern"),
+               @JsonSubTypes.Type(value = ReversePathHierarchyTokenizerBuilder.class, name = "reverse_path_hierarchy"),
+               @JsonSubTypes.Type(value = StandardTokenizerBuilder.class, name = "standard"),
+               @JsonSubTypes.Type(value = UAX29URLEmailTokenizerBuilder.class, name = "uax29_url_email"),
+               @JsonSubTypes.Type(value = UnicodeWhitespaceTokenizerBuilder.class, name = "unicode_whitespace"),
+               @JsonSubTypes.Type(value = WhitespaceTokenizerBuilder.class, name = "whitespace"),
+               @JsonSubTypes.Type(value = WikipediaTokenizerBuilder.class, name = "wikipedia")})
+public abstract class TokenizerBuilder<T extends Tokenizer> {
+
+    /**
+     * Gets or creates the Lucene {@link Tokenizer}.
+     *
+     * @return the built tokenizer
+     */
+    public abstract T buildTokenizer();
+
+    /**
+     * Returns the main parameter if it is not null, or the default parameter otherwise.
+     *
+     * @param param the main parameter
+     * @param defaultParam the default parameter, used when the main parameter is null
+     * @param <T> the type of both parameters
+     * @return param if it is not null, defaultParam otherwise
+     */
+    public static <T> T getOrDefault(T param, T defaultParam) {
+        if (param == null) {
+            return defaultParam;
+        } else {
+            return param;
+        }
+    }
+}
diff --git a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/UAX29URLEmailTokenizerBuilder.java b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/UAX29URLEmailTokenizerBuilder.java
new file mode 100644
index 000000000..35ad21e63
--- /dev/null
+++ b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/UAX29URLEmailTokenizerBuilder.java
@@ -0,0 +1,53 @@
+/*
+ * Copyright (C) 2014 Stratio (http://stratio.com)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.stratio.cassandra.lucene.schema.analysis.tokenizer;
+
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer;
+
+/**
+ * A {@link TokenizerBuilder} for building {@link org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer}
+ *
+ * @author Eduardo Alonso {@literal }
+ */
+public class UAX29URLEmailTokenizerBuilder extends TokenizerBuilder {
+
+    static final Integer DEFAULT_MAX_TOKEN_LENGTH = 255;
+
+    /** If a token is longer than this, it is split at max_token_length intervals. */
+    @JsonProperty("max_token_length")
+    final Integer maxTokenLength;
+
+    /**
+     * Builds a new {@link UAX29URLEmailTokenizerBuilder} using the specified maxTokenLength.
+     *
+     * @param maxTokenLength if a token is longer than this, it is split at max_token_length intervals
+     */
+    @JsonCreator
+    public UAX29URLEmailTokenizerBuilder(@JsonProperty("max_token_length") Integer maxTokenLength) {
+        this.maxTokenLength = getOrDefault(maxTokenLength, DEFAULT_MAX_TOKEN_LENGTH);
+    }
+
+    /** {@inheritDoc} */
+    @Override
+    public UAX29URLEmailTokenizer buildTokenizer() {
+        UAX29URLEmailTokenizer tokenizer = new UAX29URLEmailTokenizer();
+        tokenizer.setMaxTokenLength(maxTokenLength);
+        return tokenizer;
+    }
+}
diff --git a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/UnicodeWhitespaceTokenizerBuilder.java b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/UnicodeWhitespaceTokenizerBuilder.java
new file mode 100644
index 000000000..c63daa817
--- /dev/null
+++ b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/UnicodeWhitespaceTokenizerBuilder.java
@@ -0,0 +1,32 @@
+/*
+ * Copyright (C) 2014 Stratio (http://stratio.com)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.stratio.cassandra.lucene.schema.analysis.tokenizer;
+
+import org.apache.lucene.analysis.core.UnicodeWhitespaceTokenizer;
+
+/**
+ * A {@link TokenizerBuilder} for building {@link org.apache.lucene.analysis.core.UnicodeWhitespaceTokenizer}
+ *
+ * @author Eduardo Alonso {@literal }
+ */
+public class UnicodeWhitespaceTokenizerBuilder extends TokenizerBuilder {
+
+    /** {@inheritDoc} */
+    @Override
+    public UnicodeWhitespaceTokenizer buildTokenizer() {
+        return new UnicodeWhitespaceTokenizer();
+    }
+}
diff --git a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/WhitespaceTokenizerBuilder.java b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/WhitespaceTokenizerBuilder.java
new file mode 100644
index 000000000..a5f907444
--- /dev/null
+++ b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/WhitespaceTokenizerBuilder.java
@@ -0,0 +1,32 @@
+/*
+ * Copyright (C) 2014 Stratio (http://stratio.com)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.stratio.cassandra.lucene.schema.analysis.tokenizer;
+
+import org.apache.lucene.analysis.core.WhitespaceTokenizer;
+
+/**
+ * A {@link TokenizerBuilder} for building {@link org.apache.lucene.analysis.core.WhitespaceTokenizer}
+ *
+ * @author Eduardo Alonso {@literal }
+ */
+public class WhitespaceTokenizerBuilder extends TokenizerBuilder {
+
+    /** {@inheritDoc} */
+    @Override
+    public WhitespaceTokenizer buildTokenizer() {
+        return new WhitespaceTokenizer();
+    }
+}
diff --git a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/WikipediaTokenizerBuilder.java b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/WikipediaTokenizerBuilder.java
new file mode 100644
index 000000000..1afd5a52e
--- /dev/null
+++ b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/WikipediaTokenizerBuilder.java
@@ -0,0 +1,96 @@
+/*
+ * Copyright (C) 2014 Stratio (http://stratio.com)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.stratio.cassandra.lucene.schema.analysis.tokenizer;
+
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import org.apache.lucene.analysis.wikipedia.WikipediaTokenizer;
+
+import java.util.Collections;
+import java.util.Set;
+
+/**
+ * A {@link TokenizerBuilder} for building {@link org.apache.lucene.analysis.wikipedia.WikipediaTokenizer}
+ *
+ * @author Eduardo Alonso {@literal }
+ */
+public class WikipediaTokenizerBuilder extends TokenizerBuilder {
+
+    static final TokenOutputValue DEFAULT_TOKEN_OUTPUT = TokenOutputValue.TOKENS_ONLY;
+    static final Set DEFAULT_UNTOKENIZED_TYPES = Collections.emptySet();
+
+    /** this tokenizer output, only untokenized, only tokens or both */
+    @JsonProperty("token_output")
+    final TokenOutputValue tokenOutput;
+    /** //TODO */
+    @JsonProperty("untokenized_types")
+    final Set untokenizedTypes;
+
+    /**
+     * Builds a new {@link WikipediaTokenizerBuilder} using the specified tokenOutput and untokenizedTypes.
+     *
+     * @param tokenOutput this tokenizer output, only untokenized, only tokens or both
+     * @param untokenizedTypes //TODO
+     */
+    @JsonCreator
+    public WikipediaTokenizerBuilder(@JsonProperty("token_output") TokenOutputValue tokenOutput,
+                                     @JsonProperty("untokenized_types") Set untokenizedTypes) {
+        this.tokenOutput = getOrDefault(tokenOutput, DEFAULT_TOKEN_OUTPUT);
+        this.untokenizedTypes = getOrDefault(untokenizedTypes, DEFAULT_UNTOKENIZED_TYPES);
+    }
+
+    /** {@inheritDoc} */
+    @Override
+    public WikipediaTokenizer buildTokenizer() {
+        return new WikipediaTokenizer(tokenOutput.getIntegerValue(), untokenizedTypes);
+    }
+
+    public enum TokenOutputValue {
+
+        TOKENS_ONLY("TOKENS_ONLY", WikipediaTokenizer.TOKENS_ONLY),
+        UNTOKENIZED_ONLY("UNTOKENIZED_ONLY", WikipediaTokenizer.UNTOKENIZED_ONLY),
+        BOTH("BOTH", WikipediaTokenizer.BOTH);
+
+        private int integerValue;
+        private String stringValue;
+
+        TokenOutputValue(String name, int value) {
+            this.stringValue = name;
+            this.integerValue = value;
+        }
+
+        @JsonCreator
+        public static TokenOutputValue create(String value) {
+            if (value == null) {
+                throw new IllegalArgumentException();
+            }
+            for (TokenOutputValue v : values()) {
+                if (v.getStringValue().equals(value)) {
+                    return v;
+                }
+            }
+            throw new IllegalArgumentException();
+        }
+
+        public int getIntegerValue() {
+            return integerValue;
+        }
+
+        public String getStringValue() {
+            return stringValue;
+        }
+    }
+}
\ No newline at end of file
diff --git a/plugin/src/test/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/TokenizerBuilderTest.java b/plugin/src/test/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/TokenizerBuilderTest.java
new file mode 100644
index 000000000..ca7c914df
--- /dev/null
+++ b/plugin/src/test/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/TokenizerBuilderTest.java
@@ -0,0 +1,262 @@
+package com.stratio.cassandra.lucene.schema.analysis.tokenizer;
+
+import com.google.common.collect.Sets;
+import com.stratio.cassandra.lucene.common.JsonSerializer;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.analysis.core.LetterTokenizer;
+import org.apache.lucene.analysis.core.UnicodeWhitespaceTokenizer;
+import org.apache.lucene.analysis.core.WhitespaceTokenizer;
+import org.apache.lucene.analysis.ngram.NGramTokenizer;
+import org.apache.lucene.analysis.path.PathHierarchyTokenizer;
+import org.apache.lucene.analysis.path.ReversePathHierarchyTokenizer;
+import org.apache.lucene.analysis.pattern.PatternTokenizer;
+import org.apache.lucene.analysis.standard.ClassicTokenizer;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer;
+import org.apache.lucene.analysis.wikipedia.WikipediaTokenizer;
+import org.junit.Test;
+
+import java.io.IOException;
+
+import static org.junit.Assert.*;
+
+/**
+ * @author Eduardo Alonso {@literal }
+ */
+public class TokenizerBuilderTest {
+
+    private <T extends TokenizerBuilder> T assertBuilderAndTokenizer(String json, Class expectedBuilderClass, Class expectedTokenizerClass) {
+        try {
+            TokenizerBuilder abstractBuilder = JsonSerializer.fromString(json, TokenizerBuilder.class);
+            assertEquals("Expected " + expectedBuilderClass.getName() + " class", expectedBuilderClass, abstractBuilder.getClass());
+            Tokenizer tokenizer = abstractBuilder.buildTokenizer();
+            assertEquals("Expected " + expectedTokenizerClass.getName() + " class", expectedTokenizerClass, tokenizer.getClass());
+            return (T) abstractBuilder;
+        } catch (Exception e) {
+            fail(e.getLocalizedMessage());
+            return null;
+        }
+    }
+
+    private void assertJsonParseFail(String json) throws IOException {
+        JsonSerializer.fromString(json, TokenizerBuilder.class);
+    }
+
+    private void assertJsonParseFail(String json, String message) {
+        try {
+            JsonSerializer.fromString(json, TokenizerBuilder.class);
+        } catch (IOException e) {
+            assertEquals("Expected IOException with message: " + message + " but received: " + e.getMessage(), message, e.getMessage());
+            return;
+        }
+        fail("Parsing: " + json + " must generate an IOException with message: " + message + " but does not.");
+    }
+
+    private void assertExactValue(String paramName, Object expected, Object received) {
+        assertEquals("Expected " + paramName + " equals to " + expected.toString() + " but received: " + received.toString(), expected, received);
+    }
+
+    @Test
+    public void testClassicTokenizerValidJSON() {
+        String json = "{type: \"classic\", max_token_length: 250}";
+        ClassicTokenizerBuilder builder = assertBuilderAndTokenizer(json, ClassicTokenizerBuilder.class, ClassicTokenizer.class);
+        assertExactValue("ClassicTokenizerBuilder.maxTokenLength", 250, builder.maxTokenLength);
+    }
+
+    @Test
+    public void testClassicTokenizerDefaultValues() {
+        ClassicTokenizerBuilder builder = assertBuilderAndTokenizer("{type: \"classic\"}", ClassicTokenizerBuilder.class, ClassicTokenizer.class);
+        assertExactValue("ClassicTokenizerBuilder.maxTokenLength", ClassicTokenizerBuilder.DEFAULT_MAX_TOKEN_LENGTH, builder.maxTokenLength);
+    }
+
+    @Test(expected = IOException.class)
+    public void testClassicTokenizerInvalidParam() throws IOException {
+        assertJsonParseFail("{type: \"classic\", max_toen_length: 250}");
+    }
+
+    @Test
+    public void testKeywordTokenizerValidJSON() {
+        String json = "{type: \"keyword\", buffer_size: 256}";
+        KeywordTokenizerBuilder builder = assertBuilderAndTokenizer(json, KeywordTokenizerBuilder.class, KeywordTokenizer.class);
+        assertExactValue("KeywordTokenizer.bufferSize", 256, builder.bufferSize);
+    }
+
+    @Test
+    public void testKeywordTokenizerDefaultValues() {
+        KeywordTokenizerBuilder builder = assertBuilderAndTokenizer("{type: \"keyword\"}", KeywordTokenizerBuilder.class, KeywordTokenizer.class);
+        assertExactValue("KeywordTokenizerBuilder.bufferSize", KeywordTokenizerBuilder.DEFAULT_BUFFER_SIZE, builder.bufferSize);
+    }
+
+    @Test(expected = IOException.class)
+    public void testKeywordTokenizerInvalidJSON() throws IOException {
+        assertJsonParseFail("{type: \"keyword\", bufer_size: 256}");
+    }
+
+    @Test
+    public void testLetterTokenizerValidJSON() {
+        assertBuilderAndTokenizer("{type: \"letter\"}", LetterTokenizerBuilder.class, LetterTokenizer.class);
+    }
+
+    @Test
+    public void testNGramTokenizerValidJSON() {
+        String json = "{type: \"ngram\", min_gram: 1, max_gram: 1}";
+        NGramTokenizerBuilder builder = assertBuilderAndTokenizer(json, NGramTokenizerBuilder.class, NGramTokenizer.class);
+        assertExactValue("NGramTokenizerBuilder.min_gram", 1, builder.minGram);
+        assertExactValue("NGramTokenizerBuilder.max_gram", 1, builder.maxGram);
+    }
+
+    @Test
+    public void testNGramTokenizerDefaultValues() {
+        String json = "{type: \"ngram\"}";
+        NGramTokenizerBuilder builder = assertBuilderAndTokenizer(json, NGramTokenizerBuilder.class, NGramTokenizer.class);
+        assertExactValue("NGramTokenizerBuilder.min_gram", NGramTokenizerBuilder.DEFAULT_MIN_GRAM, builder.minGram);
+        assertExactValue("NGramTokenizerBuilder.max_gram", NGramTokenizerBuilder.DEFAULT_MAX_GRAM, builder.maxGram);
+    }
+
+    @Test(expected = IOException.class)
+    public void testNGramTokenizerInvalidJSON() throws IOException {
+        assertJsonParseFail("{type: \"ngram\", min_am: 1, max_gram: 1}");
+    }
+
+    @Test
+    public void testPathHierarchyTokenizerValidJSON() {
+        String json = "{type: \"path_hierarchy\", buffer_size: 246, delimiter: \"$\", replacement: \"%\", skip: 3}";
+        PathHierarchyTokenizerBuilder builder = assertBuilderAndTokenizer(json, PathHierarchyTokenizerBuilder.class, PathHierarchyTokenizer.class);
+        assertExactValue("PathHierarchyTokenizerBuilder.buffer_size", 246, builder.bufferSize);
+        assertExactValue("PathHierarchyTokenizerBuilder.delimiter", '$', builder.delimiter);
+        assertExactValue("PathHierarchyTokenizerBuilder.replacement", '%', builder.replacement);
+        assertExactValue("PathHierarchyTokenizerBuilder.skip", 3, builder.skip);
+    }
+
+    @Test
+    public void testPathHierarchyTokenizerDefaultValues() {
+        String json = "{type: \"path_hierarchy\"}";
+        PathHierarchyTokenizerBuilder builder = assertBuilderAndTokenizer(json, PathHierarchyTokenizerBuilder.class, PathHierarchyTokenizer.class);
+        assertExactValue("PathHierarchyTokenizerBuilder.buffer_size", PathHierarchyTokenizerBuilder.DEFAULT_BUFFER_SIZE, builder.bufferSize);
+        assertExactValue("PathHierarchyTokenizerBuilder.delimiter", PathHierarchyTokenizerBuilder.DEFAULT_DELIMITER, builder.delimiter);
+        assertExactValue("PathHierarchyTokenizerBuilder.replacement", PathHierarchyTokenizerBuilder.DEFAULT_REPLACEMENT, builder.replacement);
+        assertExactValue("PathHierarchyTokenizerBuilder.skip", PathHierarchyTokenizerBuilder.DEFAULT_SKIP, builder.skip);
+    }
+
+    @Test(expected = IOException.class)
+    public void testPathHierarchyTokenizerInvalidJSON() throws IOException {
+        assertJsonParseFail("{type: \"path_hierarchy\", buffer_size: 246, delimter: \"$\", replacement: \"%\", skip: 3}");
+    }
+
+    @Test
+    public void testPatternTokenizerValidJSON() {
+        String json = "{type: \"pattern\", pattern: \"[a-z]\", flags: 35, group: 0}";
+        PatternTokenizerBuilder builder = assertBuilderAndTokenizer(json, PatternTokenizerBuilder.class, PatternTokenizer.class);
+        assertExactValue("PatternTokenizerBuilder.pattern", "[a-z]", builder.pattern);
+        assertExactValue("PatternTokenizerBuilder.flags", 35, builder.flags);
+        assertExactValue("PatternTokenizerBuilder.group", 0, builder.group);
+    }
+
+    @Test
+    public void testPatternTokenizerDefaultValues() {
+        String json = "{type: \"pattern\"}";
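+        // all fields omitted, so the builder should fall back to DEFAULT_PATTERN, DEFAULT_FLAGS and DEFAULT_GROUP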
+        PatternTokenizerBuilder builder = assertBuilderAndTokenizer(json, PatternTokenizerBuilder.class, PatternTokenizer.class);
+        assertExactValue("PatternTokenizerBuilder.pattern", PatternTokenizerBuilder.DEFAULT_PATTERN, builder.pattern);
+        assertExactValue("PatternTokenizerBuilder.group", PatternTokenizerBuilder.DEFAULT_GROUP, builder.group);
+        assertExactValue("PatternTokenizerBuilder.flags", PatternTokenizerBuilder.DEFAULT_FLAGS, builder.flags);
+    }
+
+    @Test(expected = IOException.class)
+    public void testPatternTokenizerInvalidJSON() throws IOException {
+        assertJsonParseFail("{type: \"pattern\", paern: \"[a-z]\", flags: 35, group: 0}");
+    }
+
+    @Test
+    public void testReversePathHierarchyTokenizerValidJSON() {
+        String json = "{type: \"reverse_path_hierarchy\", buffer_size: 246, delimiter: \"/\", replacement: \"%\", skip: 3}";
+        ReversePathHierarchyTokenizerBuilder builder = assertBuilderAndTokenizer(json, ReversePathHierarchyTokenizerBuilder.class, ReversePathHierarchyTokenizer.class);
+        assertExactValue("ReversePathHierarchyTokenizerBuilder.buffer_size", 246, builder.bufferSize);
+        assertExactValue("ReversePathHierarchyTokenizerBuilder.delimiter", '/', builder.delimiter);
+        assertExactValue("ReversePathHierarchyTokenizerBuilder.replacement", '%', builder.replacement);
+        assertExactValue("ReversePathHierarchyTokenizerBuilder.skip", 3, builder.skip);
+    }
+
+    @Test
+    public void testReversePathHierarchyTokenizerDefaultValues() {
+        String json = "{type: \"reverse_path_hierarchy\"}";
+        ReversePathHierarchyTokenizerBuilder builder = assertBuilderAndTokenizer(json, ReversePathHierarchyTokenizerBuilder.class, ReversePathHierarchyTokenizer.class);
+        assertExactValue("ReversePathHierarchyTokenizerBuilder.buffer_size", ReversePathHierarchyTokenizerBuilder.DEFAULT_BUFFER_SIZE, builder.bufferSize);
+        assertExactValue("ReversePathHierarchyTokenizerBuilder.delimiter", ReversePathHierarchyTokenizerBuilder.DEFAULT_DELIMITER, builder.delimiter);
+        assertExactValue("ReversePathHierarchyTokenizerBuilder.replacement", ReversePathHierarchyTokenizerBuilder.DEFAULT_REPLACEMENT, builder.replacement);
+        assertExactValue("ReversePathHierarchyTokenizerBuilder.skip", ReversePathHierarchyTokenizerBuilder.DEFAULT_SKIP, builder.skip);
+    }
+
+    @Test(expected = IOException.class)
+    public void testReversePathHierarchyTokenizerInvalidJSON() throws IOException {
+        assertJsonParseFail("{type: \"reverse_path_hierarchy\", buffer_size: 246, delimiter: \"/\", replacent: \"%\", skip: 3}");
+    }
+
+    @Test
+    public void testStandardTokenizerValidJSON() {
+        String json = "{type: \"standard\", max_token_length: 246}";
+        StandardTokenizerBuilder builder = assertBuilderAndTokenizer(json, StandardTokenizerBuilder.class, StandardTokenizer.class);
+        assertExactValue("StandardTokenizerBuilder.maxTokenLength", 246, builder.maxTokenLength);
+    }
+
+    @Test
+    public void testStandardTokenizerDefaultValues() {
+        StandardTokenizerBuilder builder = assertBuilderAndTokenizer("{type: \"standard\"}", StandardTokenizerBuilder.class, StandardTokenizer.class);
+        assertExactValue("StandardTokenizerBuilder.maxTokenLength", StandardTokenizerBuilder.DEFAULT_MAX_TOKEN_LENGTH, builder.maxTokenLength);
+    }
+
+    @Test(expected = IOException.class)
+    public void testStandardTokenizerInvalidJSON() throws IOException {
+        assertJsonParseFail("{type: \"standard\", max_token_ngth: 246}");
+    }
+
+    @Test
+    public void testUAX29URLEmailTokenizerValidJSON() {
+        String json = "{type: \"uax29_url_email\", max_token_length: 249}";
+        UAX29URLEmailTokenizerBuilder builder = assertBuilderAndTokenizer(json, UAX29URLEmailTokenizerBuilder.class, UAX29URLEmailTokenizer.class);
+        assertExactValue("UAX29URLEmailTokenizerBuilder.maxTokenLength", 249, builder.maxTokenLength);
+    }
+
+    @Test
+    public void testUAX29URLEmailTokenizerDefaultValues() {
+        String json = "{type: \"uax29_url_email\"}";
+        UAX29URLEmailTokenizerBuilder builder = assertBuilderAndTokenizer(json, UAX29URLEmailTokenizerBuilder.class, UAX29URLEmailTokenizer.class);
+        assertExactValue("UAX29URLEmailTokenizerBuilder.maxTokenLength", UAX29URLEmailTokenizerBuilder.DEFAULT_MAX_TOKEN_LENGTH, builder.maxTokenLength);
+    }
+
+    @Test(expected = IOException.class)
+    public void testUAX29URLEmailTokenizerInvalidJSON() throws IOException {
+        assertJsonParseFail("{type: \"uax29_url_email\", max_token_lgth: 249}");
+    }
+
+    @Test
+    public void testUnicodeWhitespaceTokenizerValidJSON() {
+        String json = "{type:\"unicode_whitespace\"}";
+        assertBuilderAndTokenizer(json, UnicodeWhitespaceTokenizerBuilder.class, UnicodeWhitespaceTokenizer.class);
+    }
+
+    @Test
+    public void testWhitespaceTokenizerValidJSON() {
+        String json = "{type:\"whitespace\"}";
+        assertBuilderAndTokenizer(json, WhitespaceTokenizerBuilder.class, WhitespaceTokenizer.class);
+    }
+
+    @Test
+    public void testWikipediaTokenizerValidJSON() {
+        String json = "{type: \"wikipedia\", token_output: \"TOKENS_ONLY\", untokenized_types : [\"aaa\",\"bbb\"]}";
+        WikipediaTokenizerBuilder builder = assertBuilderAndTokenizer(json, WikipediaTokenizerBuilder.class, WikipediaTokenizer.class);
+        assertExactValue("WikipediaTokenizerBuilder.token_output", WikipediaTokenizerBuilder.TokenOutputValue.TOKENS_ONLY, builder.tokenOutput);
+        assertExactValue("WikipediaTokenizerBuilder.untokenized_types", Sets.newHashSet("aaa", "bbb"), builder.untokenizedTypes);
+    }
+
+    @Test
+    public void testWikipediaTokenizerDefaultValues() {
+        String json = "{type: \"wikipedia\"}";
+        WikipediaTokenizerBuilder builder = assertBuilderAndTokenizer(json, WikipediaTokenizerBuilder.class, WikipediaTokenizer.class);
+        assertExactValue("WikipediaTokenizerBuilder.token_output", WikipediaTokenizerBuilder.TokenOutputValue.TOKENS_ONLY, builder.tokenOutput);
+        assertExactValue("WikipediaTokenizerBuilder.untokenized_types", Sets.newHashSet(), builder.untokenizedTypes);
+    }
+
+    @Test(expected = IOException.class)
+    public void testWikipediaTokenizerInvalidJSON() throws IOException {
+        assertJsonParseFail("{type: \"wikipedia\", token_output: \"TOKENS_ONLY\", untoknized_types : [\"aaa\",\"bbb\"]}");
+    }
+}

From 64aee429da533ce7715d1ada240b81704fa6218f Mon Sep 17 00:00:00 2001
From: jpgilaberte
Date: Wed, 26 Apr 2017 12:46:00 +0200
Subject: [PATCH 02/40] Add lowercase, edgeNGram and thai tokenizers

---
 .../schema/analysis/CustomAnalyzer.java       |  2 +-
 .../tokenizer/EdgeNGramTokenizerBuilder.java  | 57 +++++++++++++++++++
 .../tokenizer/LowerCaseTokenizerBuilder.java  | 32 +++++++++++
 .../tokenizer/ThaiTokenizerBuilder.java       | 30 ++++++++++
 .../analysis/tokenizer/TokenizerBuilder.java  |  3 +
 .../tokenizer/WikipediaTokenizerBuilder.java  | 53 ++++++++---------
 .../tokenizer/TokenizerBuilderTest.java       | 43 ++++++++++++--
 7 files changed, 189 insertions(+), 31 deletions(-)
 create mode 100644 plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/EdgeNGramTokenizerBuilder.java
 create mode 100644 plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/LowerCaseTokenizerBuilder.java
 create mode 100644 plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/ThaiTokenizerBuilder.java

diff --git a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/CustomAnalyzer.java b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/CustomAnalyzer.java
index 647b0f213..8f6dda451 100644
--- a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/CustomAnalyzer.java
+++ b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/CustomAnalyzer.java
@@ -19,7 +19,7 @@ public CustomAnalyzer(Tokenizer tokenizer) {
     }
 
     @Override
-    protected TokenStreamComponents createComponents(String fieldName) {
+    protected Analyzer.TokenStreamComponents createComponents(String fieldName) {
 
         TokenStream ts = tokenizer;
 
diff --git a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/EdgeNGramTokenizerBuilder.java b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/EdgeNGramTokenizerBuilder.java
new file mode 100644
index 000000000..5c725e3a6
--- /dev/null
+++ b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/EdgeNGramTokenizerBuilder.java
@@ -0,0 +1,57 @@
+/*
+ * Copyright (C) 2014 Stratio (http://stratio.com)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.stratio.cassandra.lucene.schema.analysis.tokenizer;
+
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import org.apache.lucene.analysis.ngram.EdgeNGramTokenizer;
+
+/**
+ * A {@link TokenizerBuilder} for building {@link org.apache.lucene.analysis.ngram.EdgeNGramTokenizer}
+ *
+ * @author Juan Pedro Gilaberte {@literal }
+ */
+public class EdgeNGramTokenizerBuilder extends TokenizerBuilder {
+
+    /** the smallest n-gram to generate */
+    @JsonProperty("min_gram")
+    final Integer minGram;
+
+    /** the largest n-gram to generate */
+    @JsonProperty("max_gram")
+    final Integer maxGram;
+
+    /**
+     * Builds a new {@link EdgeNGramTokenizerBuilder} using the specified minGram and maxGram.
+     *
+     * @param minGram the smallest n-gram to generate
+     * @param maxGram the largest n-gram to generate
+     */
+    @JsonCreator
+    public EdgeNGramTokenizerBuilder(@JsonProperty("min_gram") Integer minGram,
+                                     @JsonProperty("max_gram") Integer maxGram) {
+        this.minGram = getOrDefault(minGram, EdgeNGramTokenizer.DEFAULT_MIN_GRAM_SIZE);
+        this.maxGram = getOrDefault(maxGram, EdgeNGramTokenizer.DEFAULT_MAX_GRAM_SIZE);
+    }
+
+    /** {@inheritDoc} */
+    @Override
+    public EdgeNGramTokenizer buildTokenizer() {
+        return new EdgeNGramTokenizer(minGram, maxGram);
+    }
+
+}
diff --git a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/LowerCaseTokenizerBuilder.java b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/LowerCaseTokenizerBuilder.java
new file mode 100644
index 000000000..bfb7b622a
--- /dev/null
+++ b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/LowerCaseTokenizerBuilder.java
@@ -0,0 +1,32 @@
+/*
+ * Copyright (C) 2014 Stratio (http://stratio.com)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.stratio.cassandra.lucene.schema.analysis.tokenizer;
+
+import org.apache.lucene.analysis.core.LowerCaseTokenizer;
+
+/**
+ * A {@link TokenizerBuilder} for building {@link org.apache.lucene.analysis.core.LowerCaseTokenizer}
+ *
+ * @author Juan Pedro Gilaberte {@literal }
+ */
+public class LowerCaseTokenizerBuilder extends TokenizerBuilder {
+
+    /** {@inheritDoc} */
+    @Override
+    public LowerCaseTokenizer buildTokenizer() {
+        return new LowerCaseTokenizer();
+    }
+}
diff --git a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/ThaiTokenizerBuilder.java b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/ThaiTokenizerBuilder.java
new file mode 100644
index 000000000..73902188e
--- /dev/null
+++ b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/ThaiTokenizerBuilder.java
@@ -0,0 +1,30 @@
+/*
+ * Copyright (C) 2014 Stratio (http://stratio.com)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.stratio.cassandra.lucene.schema.analysis.tokenizer;
+
+import org.apache.lucene.analysis.th.ThaiTokenizer;
+
+/**
+ * A {@link ThaiTokenizerBuilder} for building {@link org.apache.lucene.analysis.th.ThaiTokenizer}
+ *
+ * @author Juan Pedro Gilaberte {@literal }
+ */
+public class ThaiTokenizerBuilder extends TokenizerBuilder {
+
+    /** {@inheritDoc} */
+    @Override
+    public ThaiTokenizer buildTokenizer() {return new ThaiTokenizer(); }
+}
diff --git a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/TokenizerBuilder.java b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/TokenizerBuilder.java
index e7146da68..79c4bd644 100644
--- a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/TokenizerBuilder.java
+++ b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/TokenizerBuilder.java
@@ -24,8 +24,10 @@
  */
 @JsonTypeInfo(use = JsonTypeInfo.Id.NAME, include = JsonTypeInfo.As.PROPERTY, property = "type")
 @JsonSubTypes({@JsonSubTypes.Type(value = ClassicTokenizerBuilder.class, name = "classic"),
+               @JsonSubTypes.Type(value = EdgeNGramTokenizerBuilder.class, name = "edge_ngram"),
                @JsonSubTypes.Type(value = KeywordTokenizerBuilder.class, name = "keyword"),
                @JsonSubTypes.Type(value = LetterTokenizerBuilder.class, name = "letter"),
+               @JsonSubTypes.Type(value = LowerCaseTokenizerBuilder.class, name = "lower_case"),
                @JsonSubTypes.Type(value = NGramTokenizerBuilder.class, name = "ngram"),
                @JsonSubTypes.Type(value = PathHierarchyTokenizerBuilder.class, name = "path_hierarchy"),
                @JsonSubTypes.Type(value = PatternTokenizerBuilder.class, name = "pattern"),
@@ -33,6 +35,7 @@
                @JsonSubTypes.Type(value = StandardTokenizerBuilder.class, name = "standard"),
                @JsonSubTypes.Type(value = UAX29URLEmailTokenizerBuilder.class, name = "uax29_url_email"),
                @JsonSubTypes.Type(value = UnicodeWhitespaceTokenizerBuilder.class, name = "unicode_whitespace"),
+               @JsonSubTypes.Type(value = ThaiTokenizerBuilder.class, name = "thai"),
                @JsonSubTypes.Type(value = WhitespaceTokenizerBuilder.class, name = "whitespace"),
                @JsonSubTypes.Type(value = WikipediaTokenizerBuilder.class, name = "wikipedia")})
 public abstract class TokenizerBuilder {
diff --git a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/WikipediaTokenizerBuilder.java b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/WikipediaTokenizerBuilder.java
index 1afd5a52e..8c15f38cc 100644
--- a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/WikipediaTokenizerBuilder.java
+++ b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/WikipediaTokenizerBuilder.java
@@ -32,32 +32,6 @@ public class WikipediaTokenizerBuilder extends TokenizerBuilder
     DEFAULT_UNTOKENIZED_TYPES = Collections.emptySet();
-    /** this tokenizer output, only untokenized, only tokens or both */
-    @JsonProperty("token_output")
-    final TokenOutputValue tokenOutput;
-    /** //TODO */
-    @JsonProperty("untokenized_types")
-    final Set untokenizedTypes;
-
-    /**
-     * Builds a new {@link WikipediaTokenizerBuilder} using the specified tokenOutput and untokenizedTypes.
- * - * @param tokenOutput this tokenizer output, only untokenized, only tokens or both - * @param untokenizedTypes //TODO - */ - @JsonCreator - public WikipediaTokenizerBuilder(@JsonProperty("token_output") TokenOutputValue tokenOutput, - @JsonProperty("untokenized_types") Set untokenizedTypes) { - this.tokenOutput = getOrDefault(tokenOutput, DEFAULT_TOKEN_OUTPUT); - this.untokenizedTypes = getOrDefault(untokenizedTypes, DEFAULT_UNTOKENIZED_TYPES); - } - - /** {@inheritDoc} */ - @Override - public WikipediaTokenizer buildTokenizer() { - return new WikipediaTokenizer(tokenOutput.getIntegerValue(), untokenizedTypes); - } - public enum TokenOutputValue { TOKENS_ONLY("TOKENS_ONLY", WikipediaTokenizer.TOKENS_ONLY), @@ -93,4 +67,31 @@ public String getStringValue() { return stringValue; } } + /** this tokenizer output, only untokenized, only tokens or both */ + @JsonProperty("token_output") + final TokenOutputValue tokenOutput; + /** //TODO */ + @JsonProperty("untokenized_types") + final Set untokenizedTypes; + + /** + * Builds a new {@link WikipediaTokenizerBuilder} using the specified tokenOutput and untokenizedTypes. + * + * @param tokenOutput this tokenizer output, only untokenized, only tokens or both + * @param untokenizedTypes //TODO + */ + @JsonCreator + public WikipediaTokenizerBuilder(@JsonProperty("token_output") WikipediaTokenizerBuilder.TokenOutputValue tokenOutput, + @JsonProperty("untokenized_types") Set untokenizedTypes) { + this.tokenOutput = getOrDefault(tokenOutput, DEFAULT_TOKEN_OUTPUT); + this.untokenizedTypes = getOrDefault(untokenizedTypes, DEFAULT_UNTOKENIZED_TYPES); + } + + /** {@inheritDoc} */ + @Override + public WikipediaTokenizer buildTokenizer() { + return new WikipediaTokenizer(tokenOutput.getIntegerValue(), untokenizedTypes); + } + + } \ No newline at end of file diff --git a/plugin/src/test/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/TokenizerBuilderTest.java b/plugin/src/test/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/TokenizerBuilderTest.java index ca7c914df..3e59bf21d 100644 --- a/plugin/src/test/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/TokenizerBuilderTest.java +++ b/plugin/src/test/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/TokenizerBuilderTest.java @@ -3,10 +3,8 @@ import com.google.common.collect.Sets; import com.stratio.cassandra.lucene.common.JsonSerializer; import org.apache.lucene.analysis.Tokenizer; -import org.apache.lucene.analysis.core.KeywordTokenizer; -import org.apache.lucene.analysis.core.LetterTokenizer; -import org.apache.lucene.analysis.core.UnicodeWhitespaceTokenizer; -import org.apache.lucene.analysis.core.WhitespaceTokenizer; +import org.apache.lucene.analysis.core.*; +import org.apache.lucene.analysis.ngram.EdgeNGramTokenizer; import org.apache.lucene.analysis.ngram.NGramTokenizer; import org.apache.lucene.analysis.path.PathHierarchyTokenizer; import org.apache.lucene.analysis.path.ReversePathHierarchyTokenizer; @@ -14,6 +12,7 @@ import org.apache.lucene.analysis.standard.ClassicTokenizer; import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer; +import org.apache.lucene.analysis.th.ThaiTokenizer; import org.apache.lucene.analysis.wikipedia.WikipediaTokenizer; import org.junit.Test; @@ -96,6 +95,16 @@ public void testLetterTokenizerValidJSON() { assertBuilderAndTokenizer("{type: \"letter\"}", LetterTokenizerBuilder.class, LetterTokenizer.class); } + @Test + public void 
testLowerCaseTokenizerValidJSON() { + assertBuilderAndTokenizer("{type: \"lower_case\"}", LowerCaseTokenizerBuilder.class, LowerCaseTokenizer.class); + } + + @Test + public void testThaiTokenizerValidJSON() { + assertBuilderAndTokenizer("{type: \"thai\"}", ThaiTokenizerBuilder.class, ThaiTokenizer.class); + } + @Test public void testNGramTokenizerValidJSON() { String json = "{type: \"ngram\", min_gram: 1, max_gram: 1}"; @@ -117,6 +126,27 @@ public void testNGramTokenizerInvalidJSON() throws IOException { assertJsonParseFail("{type: \"ngram\", min_am: 1, max_gram: 1}"); } + @Test + public void testEdgeNGramTokenizerValidJSON() { + String json = "{type: \"edge_ngram\", min_gram: 1, max_gram: 1}"; + EdgeNGramTokenizerBuilder builder = assertBuilderAndTokenizer(json, EdgeNGramTokenizerBuilder.class, EdgeNGramTokenizer.class); + assertExactValue("EdgeNGramTokenizerBuilder.min_gram", 1, builder.minGram); + assertExactValue("EdgeNGramTokenizerBuilder.max_gram", 1, builder.maxGram); + } + + @Test + public void testEdgeNGramTokenizerDefaultValues() { + String json = "{type: \"edge_ngram\"}"; + EdgeNGramTokenizerBuilder builder = assertBuilderAndTokenizer(json, EdgeNGramTokenizerBuilder.class, EdgeNGramTokenizer.class); + assertExactValue("EdgeNGramTokenizerBuilder.min_gram", EdgeNGramTokenizer.DEFAULT_MIN_GRAM_SIZE, builder.minGram); + assertExactValue("EdgeNGramTokenizerBuilder.max_gram", EdgeNGramTokenizer.DEFAULT_MAX_GRAM_SIZE, builder.maxGram); + } + + @Test(expected = IOException.class) + public void testEdgeNGramTokenizerInvalidJSON() throws IOException { + assertJsonParseFail("{type: \"edge_ngram\", min_am: 1, max_gram: 1}"); + } + @Test public void testPathHierarchyTokenizerValidJSON() { String json = "{type: \"path_hierarchy\", buffer_size: 246, delimiter: \"$\", replacement: \"%\", skip: 3}"; @@ -259,4 +289,9 @@ public void testWikipediaTokenizerDefaultValues() { public void testWikipediaTokenizerInvalidJSON() throws IOException { assertJsonParseFail("{type: \"wikipedia\", token_output: \"TOKENS_ONLY\", untoknized_types : [\"aaa\",\"bbb\"]}"); } + + @Test(expected = IOException.class) + public void testInvalidTokenizerType() throws IOException { + assertJsonParseFail("{type: \"invalid_type\"}"); + } } From 1eec1d55605fed17882ab6d33ddb2cc332dc257d Mon Sep 17 00:00:00 2001 From: jpgilaberte Date: Wed, 26 Apr 2017 14:30:20 +0200 Subject: [PATCH 03/40] Reformat code --- .../schema/analysis/tokenizer/EdgeNGramTokenizerBuilder.java | 5 ++--- .../schema/analysis/tokenizer/LowerCaseTokenizerBuilder.java | 2 +- .../analysis/tokenizer/PathHierarchyTokenizerBuilder.java | 3 ++- .../tokenizer/ReversePathHierarchyTokenizerBuilder.java | 3 ++- .../schema/analysis/tokenizer/ThaiTokenizerBuilder.java | 4 +++- .../lucene/schema/analysis/tokenizer/TokenizerBuilder.java | 2 +- .../analysis/tokenizer/UAX29URLEmailTokenizerBuilder.java | 2 +- .../schema/analysis/tokenizer/WikipediaTokenizerBuilder.java | 2 +- 8 files changed, 13 insertions(+), 10 deletions(-) diff --git a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/EdgeNGramTokenizerBuilder.java b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/EdgeNGramTokenizerBuilder.java index 5c725e3a6..c4e1de701 100644 --- a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/EdgeNGramTokenizerBuilder.java +++ b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/EdgeNGramTokenizerBuilder.java @@ -24,7 +24,7 @@ * * @author Juan Pedro Gilaberte {@literal } */ 
-public class EdgeNGramTokenizerBuilder extends TokenizerBuilder{ +public class EdgeNGramTokenizerBuilder extends TokenizerBuilder { /** the smallest n-gram to generate */ @JsonProperty("min_gram") @@ -42,7 +42,7 @@ public class EdgeNGramTokenizerBuilder extends TokenizerBuilder} */ -public class LowerCaseTokenizerBuilder extends TokenizerBuilder{ +public class LowerCaseTokenizerBuilder extends TokenizerBuilder { /** {@inheritDoc} */ @Override diff --git a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/PathHierarchyTokenizerBuilder.java b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/PathHierarchyTokenizerBuilder.java index 33b0eac58..59c8169d4 100644 --- a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/PathHierarchyTokenizerBuilder.java +++ b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/PathHierarchyTokenizerBuilder.java @@ -48,7 +48,8 @@ public class PathHierarchyTokenizerBuilder extends TokenizerBuilder { /** {@inheritDoc} */ @Override - public ThaiTokenizer buildTokenizer() {return new ThaiTokenizer(); } + public ThaiTokenizer buildTokenizer() { + return new ThaiTokenizer(); + } } diff --git a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/TokenizerBuilder.java b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/TokenizerBuilder.java index 79c4bd644..0cc0b3e2f 100644 --- a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/TokenizerBuilder.java +++ b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/TokenizerBuilder.java @@ -54,7 +54,7 @@ public abstract class TokenizerBuilder { * @return if (param!=null) { return param; }else{ return defaultParam; } */ public static T getOrDefault(T param, T defaultParam) { - if (param==null) { + if (param == null) { return defaultParam; } else { return param; diff --git a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/UAX29URLEmailTokenizerBuilder.java b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/UAX29URLEmailTokenizerBuilder.java index 35ad21e63..f5964fa1e 100644 --- a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/UAX29URLEmailTokenizerBuilder.java +++ b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/UAX29URLEmailTokenizerBuilder.java @@ -39,7 +39,7 @@ public class UAX29URLEmailTokenizerBuilder extends TokenizerBuilder Date: Fri, 28 Apr 2017 13:59:15 +0200 Subject: [PATCH 04/40] Add tokenizers in builder module --- builder/pom.xml | 6 + .../analysis/tokenizer/ClassicTokenizer.java | 45 +++ .../tokenizer/EdgeNGramTokenizer.java | 52 +++ .../analysis/tokenizer/KeywordTokenizer.java | 45 +++ .../analysis/tokenizer/LetterTokenizer.java | 35 ++ .../tokenizer/LowerCaseTokenizer.java | 36 ++ .../analysis/tokenizer/NGramTokenizer.java | 53 +++ .../tokenizer/PathHierarchyTokenizer.java | 69 ++++ .../analysis/tokenizer/PatternTokenizer.java | 61 ++++ .../ReversePathHierarchyTokenizer.java | 70 ++++ .../analysis/tokenizer/StandardTokenizer.java | 45 +++ .../analysis/tokenizer/ThaiTokenizer.java | 36 ++ .../schema/analysis/tokenizer/Tokenizer.java | 57 ++++ .../tokenizer/UAX29URLEmailTokenizer.java | 46 +++ .../tokenizer/UnicodeWhitespaceTokenizer.java | 35 ++ .../tokenizer/WhitespaceTokenizer.java | 35 ++ .../tokenizer/WikipediaTokenizer.java | 95 ++++++ .../analysis/tokenizer/TokenizerTest.java | 322 ++++++++++++++++++ 
.../tokenizer/EdgeNGramTokenizerBuilder.java | 7 +- .../tokenizer/LetterTokenizerBuilder.java | 8 + .../tokenizer/LowerCaseTokenizerBuilder.java | 8 + .../tokenizer/ThaiTokenizerBuilder.java | 8 + .../tokenizer/WhitespaceTokenizerBuilder.java | 8 + .../tokenizer/TokenizerBuilderTest.java | 200 ++++++++--- 24 files changed, 1330 insertions(+), 52 deletions(-) create mode 100644 builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/ClassicTokenizer.java create mode 100644 builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/EdgeNGramTokenizer.java create mode 100644 builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/KeywordTokenizer.java create mode 100644 builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/LetterTokenizer.java create mode 100644 builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/LowerCaseTokenizer.java create mode 100644 builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/NGramTokenizer.java create mode 100644 builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/PathHierarchyTokenizer.java create mode 100644 builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/PatternTokenizer.java create mode 100644 builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/ReversePathHierarchyTokenizer.java create mode 100644 builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/StandardTokenizer.java create mode 100644 builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/ThaiTokenizer.java create mode 100644 builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/Tokenizer.java create mode 100644 builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/UAX29URLEmailTokenizer.java create mode 100644 builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/UnicodeWhitespaceTokenizer.java create mode 100644 builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/WhitespaceTokenizer.java create mode 100644 builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/WikipediaTokenizer.java create mode 100644 builder/src/test/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/TokenizerTest.java diff --git a/builder/pom.xml b/builder/pom.xml index 81cf619b1..e31776023 100644 --- a/builder/pom.xml +++ b/builder/pom.xml @@ -51,5 +51,11 @@ 4.12 test + + com.stratio.cassandra + cassandra-lucene-index-plugin + ${project.version} + test + diff --git a/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/ClassicTokenizer.java b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/ClassicTokenizer.java new file mode 100644 index 000000000..a73383193 --- /dev/null +++ b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/ClassicTokenizer.java @@ -0,0 +1,45 @@ +/* + * Copyright (C) 2014 Stratio (http://stratio.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.stratio.cassandra.lucene.builder.index.schema.analysis.tokenizer;
+
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonProperty;
+
+/**
+ * {@link Tokenizer} using one of Lucene's {@code Tokenizer}s in classpath.
+ *
+ * It uses the {@code Tokenizer}'s default (no-args) constructor.
+ *
+ * @author Juan Pedro Gilaberte {@literal }
+ */
+public class ClassicTokenizer extends Tokenizer {
+
+    static final Integer DEFAULT_MAX_TOKEN_LENGTH = 255;
+
+    /** If a token length is bigger than this, the token is split at max token length intervals. */
+    @JsonProperty("max_token_length")
+    final Integer maxTokenLength;
+
+    /**
+     * Builds a new {@link ClassicTokenizer} using the specified maxTokenLength.
+     *
+     * @param maxTokenLength if a token length is bigger than this, the token is split at max token length intervals.
+     */
+    @JsonCreator
+    public ClassicTokenizer(@JsonProperty("max_token_length") Integer maxTokenLength) {
+        this.maxTokenLength = getOrDefault(maxTokenLength, DEFAULT_MAX_TOKEN_LENGTH);
+    }
+}
diff --git a/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/EdgeNGramTokenizer.java b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/EdgeNGramTokenizer.java
new file mode 100644
index 000000000..1eec2ec05
--- /dev/null
+++ b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/EdgeNGramTokenizer.java
@@ -0,0 +1,52 @@
+/*
+ * Copyright (C) 2014 Stratio (http://stratio.com)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.stratio.cassandra.lucene.builder.index.schema.analysis.tokenizer;
+
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonProperty;
+
+/**
+ * {@link Tokenizer} using one of Lucene's {@code Tokenizer}s in classpath.
+ *
+ * It uses the {@code Tokenizer}'s default (no-args) constructor.
+ *
+ * @author Juan Pedro Gilaberte {@literal }
+ */
+public class EdgeNGramTokenizer extends Tokenizer {
+    static final Integer DEFAULT_MIN_GRAM = 1;
+    static final Integer DEFAULT_MAX_GRAM = 2;
+
+    /** the smallest n-gram to generate */
+    @JsonProperty("min_gram")
+    final Integer minGram;
+
+    /** the largest n-gram to generate */
+    @JsonProperty("max_gram")
+    final Integer maxGram;
+
+    /**
+     * Builds a new {@link EdgeNGramTokenizer} using the specified minGram and maxGram.
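+     *
+     * For example, the JSON {@code {type: "edge_ngram", min_gram: 1, max_gram: 2}} maps to this class,
+     * and the tokenizer it describes turns "house" into the front-edge grams "h" and "ho".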
+     *
+     * @param minGram the smallest n-gram to generate
+     * @param maxGram the largest n-gram to generate
+     */
+    @JsonCreator
+    public EdgeNGramTokenizer(@JsonProperty("min_gram") Integer minGram,
+                              @JsonProperty("max_gram") Integer maxGram) {
+        this.minGram = getOrDefault(minGram, DEFAULT_MIN_GRAM);
+        this.maxGram = getOrDefault(maxGram, DEFAULT_MAX_GRAM);
+    }
+}
diff --git a/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/KeywordTokenizer.java b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/KeywordTokenizer.java
new file mode 100644
index 000000000..263bf9169
--- /dev/null
+++ b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/KeywordTokenizer.java
@@ -0,0 +1,45 @@
+/*
+ * Copyright (C) 2014 Stratio (http://stratio.com)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.stratio.cassandra.lucene.builder.index.schema.analysis.tokenizer;
+
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonProperty;
+
+/**
+ * {@link Tokenizer} using one of Lucene's {@code Tokenizer}s in classpath.
+ *
+ * It uses the {@code Tokenizer}'s default (no-args) constructor.
+ *
+ * @author Juan Pedro Gilaberte {@literal }
+ */
+public class KeywordTokenizer extends Tokenizer {
+
+    static final Integer DEFAULT_BUFFER_SIZE = 256;
+
+    /** terms cache read buffer size */
+    @JsonProperty("buffer_size")
+    final Integer bufferSize;
+
+    /**
+     * Builds a new {@link KeywordTokenizer} using the specified buffer_size.
+     *
+     * @param bufferSize the terms cache read buffer size
+     */
+    @JsonCreator
+    public KeywordTokenizer(@JsonProperty("buffer_size") Integer bufferSize) {
+        this.bufferSize = getOrDefault(bufferSize, DEFAULT_BUFFER_SIZE);
+    }
+}
diff --git a/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/LetterTokenizer.java b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/LetterTokenizer.java
new file mode 100644
index 000000000..dab8cd458
--- /dev/null
+++ b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/LetterTokenizer.java
@@ -0,0 +1,35 @@
+/*
+ * Copyright (C) 2014 Stratio (http://stratio.com)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ +package com.stratio.cassandra.lucene.builder.index.schema.analysis.tokenizer; + +import com.fasterxml.jackson.annotation.JsonCreator; + +/** + * {@link Tokenizer} using a Lucene's {@code Tokenizer}s in classpath. + * + * It's uses the {@code Tokenizer}'s default (no args) constructor. + * + * @author Juan Pedro Gilaberte {@literal } + */ +public class LetterTokenizer extends Tokenizer { + + /** + * Builds a new {@link LetterTokenizer}. + */ + @JsonCreator + public LetterTokenizer() { + } +} diff --git a/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/LowerCaseTokenizer.java b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/LowerCaseTokenizer.java new file mode 100644 index 000000000..fb1f1a348 --- /dev/null +++ b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/LowerCaseTokenizer.java @@ -0,0 +1,36 @@ +/* + * Copyright (C) 2014 Stratio (http://stratio.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.stratio.cassandra.lucene.builder.index.schema.analysis.tokenizer; + +import com.fasterxml.jackson.annotation.JsonCreator; + +/** + * {@link Tokenizer} using a Lucene's {@code Tokenizer}s in classpath. + * + * It's uses the {@code Tokenizer}'s default (no args) constructor. + * + * @author Juan Pedro Gilaberte {@literal } + */ +public class LowerCaseTokenizer extends Tokenizer { + + /** + * Builds a new {@link LowerCaseTokenizer} + */ + @JsonCreator + public LowerCaseTokenizer() { + } + +} diff --git a/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/NGramTokenizer.java b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/NGramTokenizer.java new file mode 100644 index 000000000..f10abf600 --- /dev/null +++ b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/NGramTokenizer.java @@ -0,0 +1,53 @@ +/* + * Copyright (C) 2014 Stratio (http://stratio.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.stratio.cassandra.lucene.builder.index.schema.analysis.tokenizer; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; + +/** + * {@link Tokenizer} using a Lucene's {@code Tokenizer}s in classpath. + * + * It's uses the {@code Tokenizer}'s default (no args) constructor. 
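+ *
+ * For example, {@code {type: "ngram", min_gram: 1, max_gram: 2}} selects this tokenizer, which emits
+ * every gram of length 1 or 2, so "ab" yields "a", "ab" and "b".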
+ *
+ * @author Juan Pedro Gilaberte {@literal }
+ */
+public class NGramTokenizer extends Tokenizer {
+
+    static final Integer DEFAULT_MIN_GRAM = 1;
+    static final Integer DEFAULT_MAX_GRAM = 2;
+
+    /** the smallest n-gram to generate */
+    @JsonProperty("min_gram")
+    final Integer minGram;
+
+    /** the largest n-gram to generate */
+    @JsonProperty("max_gram")
+    final Integer maxGram;
+
+    /**
+     * Builds a new {@link NGramTokenizer} using the specified minGram and maxGram.
+     *
+     * @param minGram the smallest n-gram to generate
+     * @param maxGram the largest n-gram to generate
+     */
+    @JsonCreator
+    public NGramTokenizer(@JsonProperty("min_gram") Integer minGram,
+                          @JsonProperty("max_gram") Integer maxGram) {
+        this.minGram = getOrDefault(minGram, DEFAULT_MIN_GRAM);
+        this.maxGram = getOrDefault(maxGram, DEFAULT_MAX_GRAM);
+    }
+}
diff --git a/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/PathHierarchyTokenizer.java b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/PathHierarchyTokenizer.java
new file mode 100644
index 000000000..bbb3419fe
--- /dev/null
+++ b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/PathHierarchyTokenizer.java
@@ -0,0 +1,69 @@
+/*
+ * Copyright (C) 2014 Stratio (http://stratio.com)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.stratio.cassandra.lucene.builder.index.schema.analysis.tokenizer;
+
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonProperty;
+
+/**
+ * {@link Tokenizer} using one of Lucene's {@code Tokenizer}s in classpath.
+ *
+ * It uses the {@code Tokenizer}'s default (no-args) constructor.
+ *
+ * @author Juan Pedro Gilaberte {@literal }
+ */
+public class PathHierarchyTokenizer extends Tokenizer {
+
+    static final Integer DEFAULT_BUFFER_SIZE = 1024;
+    static final Character DEFAULT_DELIMITER = '/';
+    static final Character DEFAULT_REPLACEMENT = '/';
+    static final Integer DEFAULT_SKIP = 0;
+
+    /** terms cache read buffer size */
+    @JsonProperty("buffer_size")
+    final Integer bufferSize;
+
+    /** path separator */
+    @JsonProperty("delimiter")
+    final Character delimiter;
+
+    /** a replacement character for delimiter */
+    @JsonProperty("replacement")
+    final Character replacement;
+
+    /** number of initial tokens to skip */
+    @JsonProperty("skip")
+    final Integer skip;
+
+    /**
+     * Builds a new {@link PathHierarchyTokenizer} using the specified bufferSize, delimiter, replacement and skip.
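+     *
+     * For example, with the default delimiter {@code '/'} and skip 0, the input "/usr/local" is
+     * tokenized into "/usr" and "/usr/local", one token per hierarchy level.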
+ * + * @param bufferSize terms cache read buffer size + * @param delimiter path separator + * @param replacement a replacement character for delimiter + * @param skip number of initial tokens to skip + */ + @JsonCreator + public PathHierarchyTokenizer(@JsonProperty("buffer_size") Integer bufferSize, + @JsonProperty("delimiter") Character delimiter, + @JsonProperty("replacement") Character replacement, + @JsonProperty("skip") Integer skip) { + this.bufferSize = getOrDefault(bufferSize, DEFAULT_BUFFER_SIZE); + this.delimiter = getOrDefault(delimiter, DEFAULT_DELIMITER); + this.replacement = getOrDefault(replacement, DEFAULT_REPLACEMENT); + this.skip = getOrDefault(skip, DEFAULT_SKIP); + } +} diff --git a/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/PatternTokenizer.java b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/PatternTokenizer.java new file mode 100644 index 000000000..28723c857 --- /dev/null +++ b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/PatternTokenizer.java @@ -0,0 +1,61 @@ +/* + * Copyright (C) 2014 Stratio (http://stratio.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.stratio.cassandra.lucene.builder.index.schema.analysis.tokenizer; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; + +/** + * {@link Tokenizer} using a Lucene's {@code Tokenizer}s in classpath. + * + * It's uses the {@code Tokenizer}'s default (no args) constructor. + * + * @author Juan Pedro Gilaberte {@literal } + */ +public class PatternTokenizer extends Tokenizer { + + static final String DEFAULT_PATTERN = "\\W+"; + static final Integer DEFAULT_FLAGS = 0; + static final Integer DEFAULT_GROUP = -1; + + /** java regular expression */ + @JsonProperty("pattern") + final String pattern; + + /** java regular expression flags */ + @JsonProperty("flags") + final Integer flags; + + /** which pattern group to use to generate tokens (-1 for split) */ + @JsonProperty("group") + final Integer group; + + /** + * Builds a new {@link PatternTokenizer} using the specified pattern, flags, and group. 
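+     *
+     * For example, {@code {type: "pattern", pattern: ","}} keeps the default group -1 (split mode),
+     * so "first,second,third" is split at the commas into "first", "second" and "third".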
+ * + * @param pattern java regular expression + * @param flags java regular expression flags + * @param group a pattern group to use to generate tokens (-1 for split) + */ + @JsonCreator + public PatternTokenizer(@JsonProperty("pattern") String pattern, + @JsonProperty("flags") Integer flags, + @JsonProperty("group") Integer group) { + this.pattern = getOrDefault(pattern, DEFAULT_PATTERN); + this.flags = getOrDefault(flags, DEFAULT_FLAGS); + this.group = getOrDefault(group, DEFAULT_GROUP); + } +} diff --git a/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/ReversePathHierarchyTokenizer.java b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/ReversePathHierarchyTokenizer.java new file mode 100644 index 000000000..5d0652fca --- /dev/null +++ b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/ReversePathHierarchyTokenizer.java @@ -0,0 +1,70 @@ +/* + * Copyright (C) 2014 Stratio (http://stratio.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.stratio.cassandra.lucene.builder.index.schema.analysis.tokenizer; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; + +/** + * {@link Tokenizer} using a Lucene's {@code Tokenizer}s in classpath. + * + * It's uses the {@code Tokenizer}'s default (no args) constructor. + * + * @author Juan Pedro Gilaberte {@literal } + */ +public class ReversePathHierarchyTokenizer extends Tokenizer { + + static final Integer DEFAULT_BUFFER_SIZE = 1024; + static final Character DEFAULT_DELIMITER = '/'; + static final Character DEFAULT_REPLACEMENT = '/'; + static final Integer DEFAULT_SKIP = 0; + + /** terms cache read buffer size */ + @JsonProperty("buffer_size") + final Integer bufferSize; + + /** path separator */ + @JsonProperty("delimiter") + final Character delimiter; + + /** a replacement character for delimiter */ + @JsonProperty("replacement") + final Character replacement; + + /** number of initial tokens to skip */ + @JsonProperty("skip") + final Integer skip; + + /** + * Builds a new {@link ReversePathHierarchyTokenizer} using the specified bufferSize, delimiter, replacement and + * skip. 
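+     *
+     * For example, with {@code delimiter: "."} a domain-like input such as "www.site.co.uk" is
+     * tokenized end-aligned into "www.site.co.uk", "site.co.uk", "co.uk" and "uk".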
+     *
+     * @param bufferSize terms cache read buffer size
+     * @param delimiter path separator
+     * @param replacement a replacement character for delimiter
+     * @param skip number of initial tokens to skip
+     */
+    @JsonCreator
+    public ReversePathHierarchyTokenizer(@JsonProperty("buffer_size") Integer bufferSize,
+                                         @JsonProperty("delimiter") Character delimiter,
+                                         @JsonProperty("replacement") Character replacement,
+                                         @JsonProperty("skip") Integer skip) {
+        this.bufferSize = getOrDefault(bufferSize, DEFAULT_BUFFER_SIZE);
+        this.delimiter = getOrDefault(delimiter, DEFAULT_DELIMITER);
+        this.replacement = getOrDefault(replacement, DEFAULT_REPLACEMENT);
+        this.skip = getOrDefault(skip, DEFAULT_SKIP);
+    }
+}
diff --git a/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/StandardTokenizer.java b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/StandardTokenizer.java
new file mode 100644
index 000000000..cbdd65205
--- /dev/null
+++ b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/StandardTokenizer.java
@@ -0,0 +1,45 @@
+/*
+ * Copyright (C) 2014 Stratio (http://stratio.com)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.stratio.cassandra.lucene.builder.index.schema.analysis.tokenizer;
+
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonProperty;
+
+/**
+ * {@link Tokenizer} using one of Lucene's {@code Tokenizer}s in classpath.
+ *
+ * It uses the {@code Tokenizer}'s default (no-args) constructor.
+ *
+ * @author Juan Pedro Gilaberte {@literal }
+ */
+public class StandardTokenizer extends Tokenizer {
+
+    static final Integer DEFAULT_MAX_TOKEN_LENGTH = 255;
+
+    /** If a token length is bigger than this, the token is split at max token length intervals. */
+    @JsonProperty("max_token_length")
+    final Integer maxTokenLength;
+
+    /**
+     * Builds a new {@link StandardTokenizer} using the specified maxTokenLength.
+     *
+     * @param maxTokenLength if a token length is bigger than this, the token is split at max token length intervals.
+     */
+    @JsonCreator
+    public StandardTokenizer(@JsonProperty("max_token_length") Integer maxTokenLength) {
+        this.maxTokenLength = getOrDefault(maxTokenLength, DEFAULT_MAX_TOKEN_LENGTH);
+    }
+}
diff --git a/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/ThaiTokenizer.java b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/ThaiTokenizer.java
new file mode 100644
index 000000000..44cf9b936
--- /dev/null
+++ b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/ThaiTokenizer.java
@@ -0,0 +1,36 @@
+/*
+ * Copyright (C) 2014 Stratio (http://stratio.com)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.stratio.cassandra.lucene.builder.index.schema.analysis.tokenizer; + +import com.fasterxml.jackson.annotation.JsonCreator; + +/** + * {@link Tokenizer} using a Lucene's {@code Tokenizer}s in classpath. + * + * It's uses the {@code Tokenizer}'s default (no args) constructor. + * + * @author Juan Pedro Gilaberte {@literal } + */ +public class ThaiTokenizer extends Tokenizer { + + /** + * Builds a new {@link ThaiTokenizer} + */ + @JsonCreator + public ThaiTokenizer() { + } + +} diff --git a/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/Tokenizer.java b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/Tokenizer.java new file mode 100644 index 000000000..fc4a28534 --- /dev/null +++ b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/Tokenizer.java @@ -0,0 +1,57 @@ +/* + * Copyright (C) 2014 Stratio (http://stratio.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.stratio.cassandra.lucene.builder.index.schema.analysis.tokenizer; + +import com.fasterxml.jackson.annotation.JsonSubTypes; +import com.fasterxml.jackson.annotation.JsonTypeInfo; +import com.stratio.cassandra.lucene.builder.JSONBuilder; + +/** + * A Lucene {@code Tokenizer}. 
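+ *
+ * The concrete subtype is selected by the JSON {@code type} discriminator declared below; for example,
+ * {@code {type: "standard", max_token_length: 255}} deserializes to a {@link StandardTokenizer}.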
+ *
+ * @author jpgilaberte@stratio.com {@literal }
+ */
+@JsonTypeInfo(use = JsonTypeInfo.Id.NAME, include = JsonTypeInfo.As.PROPERTY, property = "type")
+@JsonSubTypes({@JsonSubTypes.Type(value = ClassicTokenizer.class, name = "classic"),
+               @JsonSubTypes.Type(value = EdgeNGramTokenizer.class, name = "edge_ngram"),
+               @JsonSubTypes.Type(value = KeywordTokenizer.class, name = "keyword"),
+               @JsonSubTypes.Type(value = LetterTokenizer.class, name = "letter"),
+               @JsonSubTypes.Type(value = LowerCaseTokenizer.class, name = "lower_case"),
+               @JsonSubTypes.Type(value = NGramTokenizer.class, name = "ngram"),
+               @JsonSubTypes.Type(value = PathHierarchyTokenizer.class, name = "path_hierarchy"),
+               @JsonSubTypes.Type(value = PatternTokenizer.class, name = "pattern"),
+               @JsonSubTypes.Type(value = ReversePathHierarchyTokenizer.class, name = "reverse_path_hierarchy"),
+               @JsonSubTypes.Type(value = StandardTokenizer.class, name = "standard"),
+               @JsonSubTypes.Type(value = UAX29URLEmailTokenizer.class, name = "uax29_url_email"),
+               @JsonSubTypes.Type(value = UnicodeWhitespaceTokenizer.class, name = "unicode_whitespace"),
+               @JsonSubTypes.Type(value = ThaiTokenizer.class, name = "thai"),
+               @JsonSubTypes.Type(value = WhitespaceTokenizer.class, name = "whitespace"),
+               @JsonSubTypes.Type(value = WikipediaTokenizer.class, name = "wikipedia")})
+public abstract class Tokenizer extends JSONBuilder {
+    /**
+     * @param param the main parameter.
+     * @param defaultParam the default parameter if main parameter is null.
+     * @param return type must extend {@link Tokenizer}
+     * @return if (param!=null) { return param; }else{ return defaultParam; }
+     */
+    public static T getOrDefault(T param, T defaultParam) {
+        if (param == null) {
+            return defaultParam;
+        } else {
+            return param;
+        }
+    }
+}
diff --git a/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/UAX29URLEmailTokenizer.java b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/UAX29URLEmailTokenizer.java
new file mode 100644
index 000000000..e9d485faf
--- /dev/null
+++ b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/UAX29URLEmailTokenizer.java
@@ -0,0 +1,46 @@
+/*
+ * Copyright (C) 2014 Stratio (http://stratio.com)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.stratio.cassandra.lucene.builder.index.schema.analysis.tokenizer;
+
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonProperty;
+
+/**
+ * {@link Tokenizer} using one of Lucene's {@code Tokenizer}s in classpath.
+ *
+ * It uses the {@code Tokenizer}'s default (no-args) constructor.
+ *
+ * @author Juan Pedro Gilaberte {@literal }
+ */
+public class UAX29URLEmailTokenizer extends Tokenizer {
+
+    static final Integer DEFAULT_MAX_TOKEN_LENGTH = 255;
+
+    /** If a token length is bigger than this, the token is split at max token length intervals.
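+     * For example, with {@code max_token_length} 3, an 8-character token would be emitted in chunks of
+     * lengths 3, 3 and 2.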
+     */
+    @JsonProperty("max_token_length")
+    final Integer maxTokenLength;
+
+    /**
+     * Builds a new {@link UAX29URLEmailTokenizer} using the specified maxTokenLength.
+     *
+     * @param maxTokenLength if a token length is bigger than this, the token is split at max token length intervals.
+     */
+    @JsonCreator
+    public UAX29URLEmailTokenizer(@JsonProperty("max_token_length") Integer maxTokenLength) {
+        this.maxTokenLength = getOrDefault(maxTokenLength, DEFAULT_MAX_TOKEN_LENGTH);
+
+    }
+}
diff --git a/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/UnicodeWhitespaceTokenizer.java b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/UnicodeWhitespaceTokenizer.java
new file mode 100644
index 000000000..1f5c36a2f
--- /dev/null
+++ b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/UnicodeWhitespaceTokenizer.java
@@ -0,0 +1,35 @@
+/*
+ * Copyright (C) 2014 Stratio (http://stratio.com)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.stratio.cassandra.lucene.builder.index.schema.analysis.tokenizer;
+
+import com.fasterxml.jackson.annotation.JsonCreator;
+
+/**
+ * {@link Tokenizer} using one of Lucene's {@code Tokenizer}s in classpath.
+ *
+ * It uses the {@code Tokenizer}'s default (no-args) constructor.
+ *
+ * @author Juan Pedro Gilaberte {@literal }
+ */
+public class UnicodeWhitespaceTokenizer extends Tokenizer {
+
+    /**
+     * Builds a new {@link UnicodeWhitespaceTokenizer}.
+     */
+    @JsonCreator
+    public UnicodeWhitespaceTokenizer() {
+    }
+}
diff --git a/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/WhitespaceTokenizer.java b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/WhitespaceTokenizer.java
new file mode 100644
index 000000000..e07cad7ca
--- /dev/null
+++ b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/WhitespaceTokenizer.java
@@ -0,0 +1,35 @@
+/*
+ * Copyright (C) 2014 Stratio (http://stratio.com)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.stratio.cassandra.lucene.builder.index.schema.analysis.tokenizer;
+
+import com.fasterxml.jackson.annotation.JsonCreator;
+
+/**
+ * {@link Tokenizer} using one of Lucene's {@code Tokenizer}s in classpath.
+ *
+ * It uses the {@code Tokenizer}'s default (no-args) constructor.
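+ *
+ * For example, the JSON {@code {type: "whitespace"}} selects this tokenizer, which simply splits the
+ * input at whitespace characters.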
+ *
+ * @author Juan Pedro Gilaberte {@literal }
+ */
+public class WhitespaceTokenizer extends Tokenizer {
+
+    /**
+     * Builds a new {@link WhitespaceTokenizer}.
+     */
+    @JsonCreator
+    public WhitespaceTokenizer() {
+    }
+}
diff --git a/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/WikipediaTokenizer.java b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/WikipediaTokenizer.java
new file mode 100644
index 000000000..78792d88d
--- /dev/null
+++ b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/WikipediaTokenizer.java
@@ -0,0 +1,95 @@
+/*
+ * Copyright (C) 2014 Stratio (http://stratio.com)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.stratio.cassandra.lucene.builder.index.schema.analysis.tokenizer;
+
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonProperty;
+
+import java.util.Collections;
+import java.util.Set;
+
+/**
+ * {@link Tokenizer} using one of Lucene's {@code Tokenizer}s in classpath.
+ *
+ * It uses the {@code Tokenizer}'s default (no-args) constructor.
+ *
+ * @author Juan Pedro Gilaberte {@literal }
+ */
+public class WikipediaTokenizer extends Tokenizer {
+
+    static final TokenOutputValue DEFAULT_TOKEN_OUTPUT = TokenOutputValue.TOKENS_ONLY;
+    static final Set DEFAULT_UNTOKENIZED_TYPES = Collections.emptySet();
+
+    static final int TOKENS_ONLY_VAR = 0;
+    static final int UNTOKENIZED_ONLY_VAR = 1;
+    static final int BOTH_VAR = 2;
+
+    public enum TokenOutputValue {
+
+        TOKENS_ONLY("TOKENS_ONLY", TOKENS_ONLY_VAR),
+        UNTOKENIZED_ONLY("UNTOKENIZED_ONLY", UNTOKENIZED_ONLY_VAR),
+        BOTH("BOTH", BOTH_VAR);
+
+        private int integerValue;
+        private String stringValue;
+
+        TokenOutputValue(String name, int value) {
+            this.stringValue = name;
+            this.integerValue = value;
+        }
+
+        @JsonCreator
+        public static TokenOutputValue create(String value) {
+            if (value == null) {
+                throw new IllegalArgumentException();
+            }
+            for (TokenOutputValue v : values()) {
+                if (v.getStringValue().equals(value)) {
+                    return v;
+                }
+            }
+            throw new IllegalArgumentException();
+        }
+
+        public int getIntegerValue() {
+            return integerValue;
+        }
+
+        public String getStringValue() {
+            return stringValue;
+        }
+    }
+
+    /** this tokenizer output, only untokenized, only tokens or both */
+    @JsonProperty("token_output")
+    final TokenOutputValue tokenOutput;
+    /** //TODO */
+    @JsonProperty("untokenized_types")
+    final Set untokenizedTypes;
+
+    /**
+     * Builds a new {@link WikipediaTokenizer} using the specified tokenOutput and untokenizedTypes.
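+     *
+     * For example, {@code {type: "wikipedia", token_output: "TOKENS_ONLY", untokenized_types: ["aaa", "bbb"]}}
+     * binds to this constructor; token_output also accepts "UNTOKENIZED_ONLY" and "BOTH".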
+     *
+     * @param tokenOutput this tokenizer output, only untokenized, only tokens or both
+     * @param untokenizedTypes //TODO
+     */
+    @JsonCreator
+    public WikipediaTokenizer(@JsonProperty("token_output") WikipediaTokenizer.TokenOutputValue tokenOutput,
+                              @JsonProperty("untokenized_types") Set untokenizedTypes) {
+        this.tokenOutput = getOrDefault(tokenOutput, DEFAULT_TOKEN_OUTPUT);
+        this.untokenizedTypes = getOrDefault(untokenizedTypes, DEFAULT_UNTOKENIZED_TYPES);
+    }
+}
\ No newline at end of file
diff --git a/builder/src/test/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/TokenizerTest.java b/builder/src/test/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/TokenizerTest.java
new file mode 100644
index 000000000..d3f4f27fd
--- /dev/null
+++ b/builder/src/test/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/TokenizerTest.java
@@ -0,0 +1,322 @@
+package com.stratio.cassandra.lucene.builder.index.schema.analysis.tokenizer;
+
+import com.google.common.collect.Sets;
+import com.stratio.cassandra.lucene.common.JsonSerializer;
+import org.junit.Test;
+
+import java.io.IOException;
+
+import static org.junit.Assert.*;
+
+/**
+ * @author Juan Pedro Gilaberte {@literal }
+ */
+public class TokenizerTest {
+
+    private T assertAndTokenizer(String json, Class expectedClass) {
+        try {
+            Tokenizer abstractBuilder = JsonSerializer.fromString(json, Tokenizer.class);
+            assertEquals("Expected " + expectedClass.getName() + " class", expectedClass, abstractBuilder.getClass());
+            return (T) abstractBuilder;
+        } catch (Exception e) {
+            fail(e.getLocalizedMessage());
+            return null;
+        }
+    }
+
+    private void assertJsonParseFail(String json) throws IOException {
+        JsonSerializer.fromString(json, Tokenizer.class);
+    }
+
+    private void assertJsonParseFail(String json, String message) {
+        try {
+            JsonSerializer.fromString(json, Tokenizer.class);
+        } catch (IOException e) {
+            assertEquals("Expected IOException with message: "
+                         + message
+                         + " but received: "
+                         + e.getMessage()
+                         + " localMess: "
+                         + e.getLocalizedMessage(), message, e.getMessage());
+            // The expected exception was thrown with the expected message, so the test passes.
+            return;
+        }
+        fail("Parsing: " + json + " must generate an IOException with message: " + message + " but does not.");
+    }
+
+    private void assertExactValue(String paramName, Object expected, Object received) {
+        assertEquals("Expected " +
+                     paramName +
+                     " equals to " +
+                     expected.toString() +
+                     " but received: " +
+                     received.toString(), expected, received);
+    }
+
+    @Test
+    public void testClassicTokenizerValidJSON() {
+        String json = "{type: \"classic\", max_token_length: 250}";
+        ClassicTokenizer builder = assertAndTokenizer(json, ClassicTokenizer.class);
+        assertExactValue("ClassicTokenizer.maxTokenLength", 250, builder.maxTokenLength);
+    }
+
+    @Test
+    public void testClassicTokenizerDefaultValues() {
+        ClassicTokenizer builder = assertAndTokenizer("{type: \"classic\"}", ClassicTokenizer.class);
+        assertExactValue("ClassicTokenizer.maxTokenLength",
+                         ClassicTokenizer.DEFAULT_MAX_TOKEN_LENGTH,
+                         builder.maxTokenLength);
+    }
+
+    @Test(expected = IOException.class)
+    public void testClassicTokenizerInvalidParam() throws IOException {
+        assertJsonParseFail("{type: \"classic\", max_toen_length: 250}");
+    }
+
+    @Test
+    public void testKeywordTokenizerValidJSON() {
+        String json = "{type: \"keyword\", buffer_size: 256}";
+        KeywordTokenizer builder = assertAndTokenizer(json, KeywordTokenizer.class);
+        assertExactValue("KeywordTokenizer.bufferSize", 256, builder.bufferSize);
+    }
+
+    @Test
+    public void testKeywordTokenizerDefaultValues() {
+        KeywordTokenizer builder = assertAndTokenizer("{type: \"keyword\"}", KeywordTokenizer.class);
+        assertExactValue("KeywordTokenizer.bufferSize", KeywordTokenizer.DEFAULT_BUFFER_SIZE, builder.bufferSize);
+    }
+
+    @Test(expected = IOException.class)
+    public void testKeywordTokenizerInvalidJSON() throws IOException {
+        assertJsonParseFail("{type: \"keyword\", bufer_size: 256}");
+    }
+
+    @Test
+    public void testLetterTokenizerValidJSON() {
+        assertAndTokenizer("{type: \"letter\"}", LetterTokenizer.class);
+    }
+
+    @Test
+    public void testLowerCaseTokenizerValidJSON() {
+        assertAndTokenizer("{type: \"lower_case\"}", LowerCaseTokenizer.class);
+    }
+
+    @Test
+    public void testThaiTokenizerValidJSON() {
+        assertAndTokenizer("{type: \"thai\"}", ThaiTokenizer.class);
+    }
+
+    @Test
+    public void testNGramTokenizerValidJSON() {
+        String json = "{type: \"ngram\", min_gram: 1, max_gram: 2}";
+        NGramTokenizer builder = assertAndTokenizer(json, NGramTokenizer.class);
+        assertExactValue("NGramTokenizer.min_gram", 1, builder.minGram);
+        assertExactValue("NGramTokenizer.max_gram", 2, builder.maxGram);
+    }
+
+    @Test
+    public void testNGramTokenizerDefaultValues() {
+        String json = "{type: \"ngram\"}";
+        NGramTokenizer builder = assertAndTokenizer(json, NGramTokenizer.class);
+        assertExactValue("NGramTokenizer.min_gram", NGramTokenizer.DEFAULT_MIN_GRAM, builder.minGram);
+        assertExactValue("NGramTokenizer.max_gram", NGramTokenizer.DEFAULT_MAX_GRAM, builder.maxGram);
+    }
+
+    @Test(expected = IOException.class)
+    public void testNGramTokenizerInvalidJSON() throws IOException {
+        assertJsonParseFail("{type: \"ngram\", min_am: 1, max_gram: 1}");
+    }
+
+    @Test
+    public void testEdgeNGramTokenizerValidJSON() {
+        String json = "{type: \"edge_ngram\", min_gram: 1, max_gram: 2}";
+        EdgeNGramTokenizer builder = assertAndTokenizer(json, EdgeNGramTokenizer.class);
+        assertExactValue("EdgeNGramTokenizer.min_gram", 1, builder.minGram);
+        assertExactValue("EdgeNGramTokenizer.max_gram", 2, builder.maxGram);
+    }
+
+    @Test
+    public void testEdgeNGramTokenizerDefaultValues() {
+        String json = "{type: \"edge_ngram\"}";
+        EdgeNGramTokenizer builder = assertAndTokenizer(json, EdgeNGramTokenizer.class);
+        assertExactValue("EdgeNGramTokenizer.min_gram", EdgeNGramTokenizer.DEFAULT_MIN_GRAM, builder.minGram);
+        assertExactValue("EdgeNGramTokenizer.max_gram", EdgeNGramTokenizer.DEFAULT_MAX_GRAM, builder.maxGram);
+    }
+
+    @Test(expected = IOException.class)
+    public void testEdgeNGramTokenizerInvalidJSON() throws IOException {
+        assertJsonParseFail("{type: \"edge_ngram\", min_am: 1, max_gram: 1}");
+    }
+
+    @Test
+    public void testPathHierarchyTokenizerValidJSON() {
+        String json = "{type: \"path_hierarchy\", buffer_size: 246, delimiter: \"$\", replacement: \"%\", skip: 3}";
+        PathHierarchyTokenizer builder = assertAndTokenizer(json, PathHierarchyTokenizer.class);
+        assertExactValue("PathHierarchyTokenizer.buffer_size", 246, builder.bufferSize);
+        assertExactValue("PathHierarchyTokenizer.delimiter", '$', builder.delimiter);
+        assertExactValue("PathHierarchyTokenizer.replacement", '%', builder.replacement);
+        assertExactValue("PathHierarchyTokenizer.skip", 3, builder.skip);
+    }
+
+    @Test
+    public void testPathHierarchyTokenizerDefaultValues() {
+        String json = "{type: \"path_hierarchy\"}";
+        PathHierarchyTokenizer builder = assertAndTokenizer(json,
PathHierarchyTokenizer.class); + assertExactValue("PathHierarchyTokenizer.buffer_size", + PathHierarchyTokenizer.DEFAULT_BUFFER_SIZE, + builder.bufferSize); + assertExactValue("PathHierarchyTokenizer.delimiter", + PathHierarchyTokenizer.DEFAULT_DELIMITER, + builder.delimiter); + assertExactValue("PathHierarchyTokenizer.replacement", + PathHierarchyTokenizer.DEFAULT_REPLACEMENT, + builder.replacement); + assertExactValue("PathHierarchyTokenizer.skip", PathHierarchyTokenizer.DEFAULT_SKIP, builder.skip); + } + + @Test(expected = IOException.class) + public void testPathHierarchyTokenizerInvalidJSON() throws IOException { + assertJsonParseFail("{type: \"path_hierarchy\", buffer_size: 246, delimter: \"$\", replacement: \"%\", skip: 3}"); + } + + @Test + public void testPatternTokenizerValidJSON() { + String json = "{type: \"pattern\", pattern: \"[a-z]\", flags: 35, group: 0}"; + PatternTokenizer builder = assertAndTokenizer(json, PatternTokenizer.class); + assertExactValue("PathHierarchyTokenizer.pattern", "[a-z]", builder.pattern); + assertExactValue("PathHierarchyTokenizer.flags", 35, builder.flags); + assertExactValue("PathHierarchyTokenizer.group", 0, builder.group); + } + + @Test + public void testPatternTokenizerDefaultValues() { + String json = "{type: \"pattern\"}"; + PatternTokenizer builder = assertAndTokenizer(json, PatternTokenizer.class); + assertExactValue("PathHierarchyTokenizer.pattern", PatternTokenizer.DEFAULT_PATTERN, builder.pattern); + assertExactValue("PathHierarchyTokenizer.group", PatternTokenizer.DEFAULT_GROUP, builder.group); + assertExactValue("PathHierarchyTokenizer.group", PatternTokenizer.DEFAULT_FLAGS, builder.flags); + } + + @Test(expected = IOException.class) + public void testPatternTokenizerInvalidJSON() throws IOException { + assertJsonParseFail("{type: \"pattern\", paern: \"[a-z]\", flags: 35, group: 0}"); + } + + @Test + public void testReversePathHierarchyTokenizerValidJSON() { + String + json + = "{type: \"reverse_path_hierarchy\", buffer_size: 246, delimiter: \"/\", replacement: \"%\", skip: 3}"; + ReversePathHierarchyTokenizer builder = assertAndTokenizer(json, ReversePathHierarchyTokenizer.class); + assertExactValue("ReversePathHierarchyTokenizer.buffer_size", 246, builder.bufferSize); + assertExactValue("ReversePathHierarchyTokenizer.delimiter", '/', builder.delimiter); + assertExactValue("ReversePathHierarchyTokenizer.replacement", '%', builder.replacement); + assertExactValue("ReversePathHierarchyTokenizer.skip", 3, builder.skip); + } + + @Test + public void testReversePathHierarchyTokenizerDefaultValues() { + String json = "{type: \"reverse_path_hierarchy\"}"; + ReversePathHierarchyTokenizer builder = assertAndTokenizer(json, ReversePathHierarchyTokenizer.class); + assertExactValue("PathHierarchyTokenizer.buffer_size", + ReversePathHierarchyTokenizer.DEFAULT_BUFFER_SIZE, + builder.bufferSize); + assertExactValue("PathHierarchyTokenizer.delimiter", + ReversePathHierarchyTokenizer.DEFAULT_DELIMITER, + builder.delimiter); + assertExactValue("PathHierarchyTokenizer.replacement", + ReversePathHierarchyTokenizer.DEFAULT_REPLACEMENT, + builder.replacement); + assertExactValue("PathHierarchyTokenizer.skip", ReversePathHierarchyTokenizer.DEFAULT_SKIP, builder.skip); + } + + @Test(expected = IOException.class) + public void testReversePathHierarchyTokenizerInvalidJSON() throws IOException { + assertJsonParseFail( + "{type: \"reverse_path_hierarchy\", buffer_size: 246, delimiter: \"/\", replacent: \"%\", skip: 3}"); + } + + @Test + public void 
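The tests above and below all funnel through Jackson's polymorphic deserialization. As a rough sketch of that round trip, not part of the patch, and assuming JsonSerializer is a thin wrapper around Jackson's ObjectMapper with unquoted field names enabled (the wrapper itself is not shown in this patch; the class is placed in the builder tokenizer package so the names resolve):

    package com.stratio.cassandra.lucene.builder.index.schema.analysis.tokenizer;

    import com.fasterxml.jackson.core.JsonParser;
    import com.fasterxml.jackson.databind.ObjectMapper;

    public class TokenizerJsonRoundTrip {
        public static void main(String[] args) throws Exception {
            ObjectMapper mapper = new ObjectMapper();
            // The test JSON uses unquoted field names, so the parser must allow them
            mapper.configure(JsonParser.Feature.ALLOW_UNQUOTED_FIELD_NAMES, true);
            // The "type" discriminator selects the subtype registered on Tokenizer
            // via @JsonTypeInfo/@JsonSubTypes, e.g. "classic" -> ClassicTokenizer
            Tokenizer tokenizer = mapper.readValue("{type: \"classic\", max_token_length: 250}", Tokenizer.class);
            System.out.println(tokenizer.getClass().getSimpleName()); // ClassicTokenizer
            // Misspelled attributes such as max_toen_length make Jackson throw an
            // exception extending IOException, which is what the invalid-JSON tests expect
        }
    }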
+    @Test
+    public void testStandardTokenizerValidJSON() {
+        String json = "{type: \"standard\", max_token_length: 246}";
+        StandardTokenizer builder = assertAndTokenizer(json, StandardTokenizer.class);
+        assertExactValue("StandardTokenizer.maxTokenLength", 246, builder.maxTokenLength);
+    }
+
+    @Test
+    public void testStandardTokenizerDefaultValues() {
+        StandardTokenizer builder = assertAndTokenizer("{type: \"standard\"}", StandardTokenizer.class);
+        assertExactValue("StandardTokenizer.maxTokenLength",
+                         StandardTokenizer.DEFAULT_MAX_TOKEN_LENGTH,
+                         builder.maxTokenLength);
+    }
+
+    @Test(expected = IOException.class)
+    public void testStandardTokenizerInvalidJSON() throws IOException {
+        assertJsonParseFail("{type: \"standard\", max_token_ngth: 246}");
+    }
+
+    @Test
+    public void testUAX29URLEmailTokenizerValidJSON() {
+        String json = "{type: \"uax29_url_email\", max_token_length: 249}";
+        UAX29URLEmailTokenizer builder = assertAndTokenizer(json, UAX29URLEmailTokenizer.class);
+        assertExactValue("UAX29URLEmailTokenizer.maxTokenLength", 249, builder.maxTokenLength);
+    }
+
+    @Test
+    public void testUAX29URLEmailTokenizerDefaultValues() {
+        String json = "{type: \"uax29_url_email\"}";
+        UAX29URLEmailTokenizer builder = assertAndTokenizer(json, UAX29URLEmailTokenizer.class);
+        assertExactValue("UAX29URLEmailTokenizer.maxTokenLength",
+                         UAX29URLEmailTokenizer.DEFAULT_MAX_TOKEN_LENGTH,
+                         builder.maxTokenLength);
+    }
+
+    @Test(expected = IOException.class)
+    public void testUAX29URLEmailTokenizerInvalidJSON() throws IOException {
+        assertJsonParseFail("{type: \"uax29_url_email\", max_token_lgth: 249}");
+    }
+
+    @Test
+    public void testUnicodeWhitespaceTokenizerValidJSON() {
+        String json = "{type:\"unicode_whitespace\"}";
+        assertAndTokenizer(json, UnicodeWhitespaceTokenizer.class);
+    }
+
+    @Test
+    public void testWhitespaceTokenizerValidJSON() {
+        String json = "{type:\"whitespace\"}";
+        assertAndTokenizer(json, WhitespaceTokenizer.class);
+    }
+
+    @Test
+    public void testWikipediaTokenizerValidJSON() {
+        String json = "{type: \"wikipedia\", token_output: \"TOKENS_ONLY\", untokenized_types : [\"aaa\",\"bbb\"]}";
+        WikipediaTokenizer builder = assertAndTokenizer(json, WikipediaTokenizer.class);
+        assertExactValue("WikipediaTokenizer.token_output",
+                         WikipediaTokenizer.TokenOutputValue.TOKENS_ONLY,
+                         builder.tokenOutput);
+        assertExactValue("WikipediaTokenizer.untokenized_types",
+                         Sets.newHashSet("aaa", "bbb"),
+                         builder.untokenizedTypes);
+    }
+
+    @Test
+    public void testWikipediaTokenizerDefaultValues() {
+        String json = "{type: \"wikipedia\"}";
+        WikipediaTokenizer builder = assertAndTokenizer(json, WikipediaTokenizer.class);
+        assertExactValue("WikipediaTokenizer.token_output",
+                         WikipediaTokenizer.TokenOutputValue.TOKENS_ONLY,
+                         builder.tokenOutput);
+        assertExactValue("WikipediaTokenizer.untokenized_types", Sets.newHashSet(), builder.untokenizedTypes);
+    }
+
+    @Test(expected = IOException.class)
+    public void testWikipediaTokenizerInvalidJSON() throws IOException {
+        assertJsonParseFail("{type: \"wikipedia\", token_output: \"TOKENS_ONLY\", untoknized_types : [\"aaa\",\"bbb\"]}");
+    }
+
+    @Test(expected = IOException.class)
+    public void testInvalidTokenizerType() throws IOException {
+        assertJsonParseFail("{type: \"invalid_type\"}");
+    }
+}
\ No newline at end of file
diff --git a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/EdgeNGramTokenizerBuilder.java b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/EdgeNGramTokenizerBuilder.java
index
c4e1de701..572bb2afa 100644 --- a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/EdgeNGramTokenizerBuilder.java +++ b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/EdgeNGramTokenizerBuilder.java @@ -26,6 +26,9 @@ */ public class EdgeNGramTokenizerBuilder extends TokenizerBuilder { + static final Integer DEFAULT_MIN_GRAM = 1; + static final Integer DEFAULT_MAX_GRAM = 1; + /** the smallest n-gram to generate */ @JsonProperty("min_gram") final Integer minGram; @@ -43,8 +46,8 @@ public class EdgeNGramTokenizerBuilder extends TokenizerBuilder { + /** + * Builds a new {@link LetterTokenizer}. + */ + @JsonCreator + public LetterTokenizerBuilder() { + } + /** {@inheritDoc} */ @Override public LetterTokenizer buildTokenizer() { diff --git a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/LowerCaseTokenizerBuilder.java b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/LowerCaseTokenizerBuilder.java index 86cb3fc8e..bf0959608 100644 --- a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/LowerCaseTokenizerBuilder.java +++ b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/LowerCaseTokenizerBuilder.java @@ -15,6 +15,7 @@ */ package com.stratio.cassandra.lucene.schema.analysis.tokenizer; +import com.fasterxml.jackson.annotation.JsonCreator; import org.apache.lucene.analysis.core.LowerCaseTokenizer; /** @@ -24,6 +25,13 @@ */ public class LowerCaseTokenizerBuilder extends TokenizerBuilder { + /** + * Builds a new {@link LowerCaseTokenizerBuilder} + */ + @JsonCreator + public LowerCaseTokenizerBuilder() { + } + /** {@inheritDoc} */ @Override public LowerCaseTokenizer buildTokenizer() { diff --git a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/ThaiTokenizerBuilder.java b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/ThaiTokenizerBuilder.java index f3694d4ae..4d46ba9da 100644 --- a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/ThaiTokenizerBuilder.java +++ b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/ThaiTokenizerBuilder.java @@ -15,6 +15,7 @@ */ package com.stratio.cassandra.lucene.schema.analysis.tokenizer; +import com.fasterxml.jackson.annotation.JsonCreator; import org.apache.lucene.analysis.th.ThaiTokenizer; /** @@ -24,6 +25,13 @@ */ public class ThaiTokenizerBuilder extends TokenizerBuilder { + /** + * Builds a new {@link ThaiTokenizerBuilder} + */ + @JsonCreator + public ThaiTokenizerBuilder() { + } + /** {@inheritDoc} */ @Override public ThaiTokenizer buildTokenizer() { diff --git a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/WhitespaceTokenizerBuilder.java b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/WhitespaceTokenizerBuilder.java index a5f907444..308d7e073 100644 --- a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/WhitespaceTokenizerBuilder.java +++ b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/WhitespaceTokenizerBuilder.java @@ -15,6 +15,7 @@ */ package com.stratio.cassandra.lucene.schema.analysis.tokenizer; +import com.fasterxml.jackson.annotation.JsonCreator; import org.apache.lucene.analysis.core.WhitespaceTokenizer; /** @@ -24,6 +25,13 @@ */ public class WhitespaceTokenizerBuilder extends TokenizerBuilder { + /** + * Builds a new {@link WhitespaceTokenizerBuilder} + */ + 
@JsonCreator + public WhitespaceTokenizerBuilder() { + } + /** {@inheritDoc} */ @Override public WhitespaceTokenizer buildTokenizer() { diff --git a/plugin/src/test/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/TokenizerBuilderTest.java b/plugin/src/test/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/TokenizerBuilderTest.java index 3e59bf21d..199d9d2bb 100644 --- a/plugin/src/test/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/TokenizerBuilderTest.java +++ b/plugin/src/test/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/TokenizerBuilderTest.java @@ -28,15 +28,20 @@ public class TokenizerBuilderTest { private T assertBuilderAndTokenizer(String json, Class expectedBuilderClass, Class expectedTokenizerClass) { try { TokenizerBuilder abstractBuilder = JsonSerializer.fromString(json, TokenizerBuilder.class); - assertEquals("Expected " + expectedBuilderClass.getName() + " class", expectedBuilderClass, abstractBuilder.getClass()); + assertEquals("Expected " + expectedBuilderClass.getName() + " class", + expectedBuilderClass, + abstractBuilder.getClass()); Tokenizer tokenizer = abstractBuilder.buildTokenizer(); - assertEquals("Expected " + expectedTokenizerClass.getName() + " class", expectedTokenizerClass, tokenizer.getClass()); + assertEquals("Expected " + expectedTokenizerClass.getName() + " class", + expectedTokenizerClass, + tokenizer.getClass()); return (T) abstractBuilder; } catch (Exception e) { fail(e.getLocalizedMessage()); return null; } } + private void assertJsonParseFail(String json) throws IOException { JsonSerializer.fromString(json, TokenizerBuilder.class); } @@ -45,26 +50,43 @@ private void assertJsonParseFail(String json, String message) { try { JsonSerializer.fromString(json, TokenizerBuilder.class); } catch (IOException e) { - assertEquals("Expected IOException with message: " + message + " but received: " + e.getMessage() + " localMess: " + e.getLocalizedMessage(), message, e.getMessage()); + assertEquals("Expected IOException with message: " + + message + + " but received: " + + e.getMessage() + + " localMess: " + + e.getLocalizedMessage(), message, e.getMessage()); } - assertFalse("Parsing: " + json + " must generate an IOException with message: " + message + " but does not.", true); + assertFalse("Parsing: " + json + " must generate an IOException with message: " + message + " but does not.", + true); } private void assertExactValue(String paramName, Object expected, Object received) { - assertEquals("Expected " + paramName + " equals to " + expected.toString() + " but received: " + received.toString(), expected, received); + assertEquals("Expected " + + paramName + + " equals to " + + expected.toString() + + " but received: " + + received.toString(), expected, received); } @Test public void testClassicTokenizerValidJSON() { String json = "{type: \"classic\", max_token_length: 250}"; - ClassicTokenizerBuilder builder = assertBuilderAndTokenizer(json, ClassicTokenizerBuilder.class, ClassicTokenizer.class); + ClassicTokenizerBuilder builder = assertBuilderAndTokenizer(json, + ClassicTokenizerBuilder.class, + ClassicTokenizer.class); assertExactValue("ClassicTokenizerBuilder.maxTokenLength", 250, builder.maxTokenLength); } @Test public void testClassicTokenizerDefaultValues() { - ClassicTokenizerBuilder builder = assertBuilderAndTokenizer("{type: \"classic\"}", ClassicTokenizerBuilder.class, ClassicTokenizer.class); - assertExactValue("ClassicTokenizerBuilder.maxTokenLength", 
ClassicTokenizerBuilder.DEFAULT_MAX_TOKEN_LENGTH, builder.maxTokenLength); + ClassicTokenizerBuilder builder = assertBuilderAndTokenizer("{type: \"classic\"}", + ClassicTokenizerBuilder.class, + ClassicTokenizer.class); + assertExactValue("ClassicTokenizerBuilder.maxTokenLength", + ClassicTokenizerBuilder.DEFAULT_MAX_TOKEN_LENGTH, + builder.maxTokenLength); } @Test(expected = IOException.class) @@ -75,14 +97,20 @@ public void testClassicTokenizerInvalidParam() throws IOException { @Test public void testKeywordTokenizerValidJSON() { String json = "{type: \"keyword\", buffer_size: 256}"; - KeywordTokenizerBuilder builder = assertBuilderAndTokenizer(json, KeywordTokenizerBuilder.class, KeywordTokenizer.class); + KeywordTokenizerBuilder builder = assertBuilderAndTokenizer(json, + KeywordTokenizerBuilder.class, + KeywordTokenizer.class); assertExactValue("KeywordTokenizer.bufferSize", 256, builder.bufferSize); } @Test public void testKeywordTokenizerDefaultValues() { - KeywordTokenizerBuilder builder = assertBuilderAndTokenizer("{type: \"keyword\"}", KeywordTokenizerBuilder.class, KeywordTokenizer.class); - assertExactValue("ClassicTokenizerBuilder.maxTokenLength", KeywordTokenizerBuilder.DEFAULT_BUFFER_SIZE, builder.bufferSize); + KeywordTokenizerBuilder builder = assertBuilderAndTokenizer("{type: \"keyword\"}", + KeywordTokenizerBuilder.class, + KeywordTokenizer.class); + assertExactValue("ClassicTokenizerBuilder.maxTokenLength", + KeywordTokenizerBuilder.DEFAULT_BUFFER_SIZE, + builder.bufferSize); } @Test(expected = IOException.class) @@ -107,16 +135,20 @@ public void testThaiTokenizerValidJSON() { @Test public void testNGramTokenizerValidJSON() { - String json = "{type: \"ngram\", min_gram: 1, max_gram: 1}"; - NGramTokenizerBuilder builder = assertBuilderAndTokenizer(json, NGramTokenizerBuilder.class, NGramTokenizer.class); - assertExactValue("NGramTokenizerBuilder.min_gram", 1, builder.minGram); - assertExactValue("NGramTokenizerBuilder.max_gram", 1, builder.maxGram); + String json = "{type: \"ngram\", min_gram: 1, max_gram: 2}"; + NGramTokenizerBuilder builder = assertBuilderAndTokenizer(json, + NGramTokenizerBuilder.class, + NGramTokenizer.class); + assertExactValue("NGramTokenizerBuilder.min_gram", NGramTokenizerBuilder.DEFAULT_MIN_GRAM, builder.minGram); + assertExactValue("NGramTokenizerBuilder.max_gram", NGramTokenizerBuilder.DEFAULT_MAX_GRAM, builder.maxGram); } @Test public void testNGramTokenizerDefaultValues() { String json = "{type: \"ngram\"}"; - NGramTokenizerBuilder builder = assertBuilderAndTokenizer(json, NGramTokenizerBuilder.class, NGramTokenizer.class); + NGramTokenizerBuilder builder = assertBuilderAndTokenizer(json, + NGramTokenizerBuilder.class, + NGramTokenizer.class); assertExactValue("NGramTokenizerBuilder.min_gram", NGramTokenizerBuilder.DEFAULT_MIN_GRAM, builder.minGram); assertExactValue("NGramTokenizerBuilder.max_gram", NGramTokenizerBuilder.DEFAULT_MAX_GRAM, builder.maxGram); } @@ -129,17 +161,29 @@ public void testNGramTokenizerInvalidJSON() throws IOException { @Test public void testEdgeNGramTokenizerValidJSON() { String json = "{type: \"edge_ngram\", min_gram: 1, max_gram: 1}"; - EdgeNGramTokenizerBuilder builder = assertBuilderAndTokenizer(json, EdgeNGramTokenizerBuilder.class, EdgeNGramTokenizer.class); - assertExactValue("EdgeNGramTokenizerBuilder.min_gram", 1, builder.minGram); - assertExactValue("EdgeNGramTokenizerBuilder.max_gram", 1, builder.maxGram); + EdgeNGramTokenizerBuilder builder = assertBuilderAndTokenizer(json, + 
EdgeNGramTokenizerBuilder.class, + EdgeNGramTokenizer.class); + assertExactValue("EdgeNGramTokenizerBuilder.min_gram", + EdgeNGramTokenizer.DEFAULT_MIN_GRAM_SIZE, + builder.minGram); + assertExactValue("EdgeNGramTokenizerBuilder.max_gram", + EdgeNGramTokenizer.DEFAULT_MAX_GRAM_SIZE, + builder.maxGram); } @Test public void testEdgeNGramTokenizerDefaultValues() { String json = "{type: \"edge_ngram\"}"; - EdgeNGramTokenizerBuilder builder = assertBuilderAndTokenizer(json, EdgeNGramTokenizerBuilder.class, EdgeNGramTokenizer.class); - assertExactValue("EdgeNGramTokenizerBuilder.min_gram", EdgeNGramTokenizer.DEFAULT_MIN_GRAM_SIZE, builder.minGram); - assertExactValue("EdgeNGramTokenizerBuilder.max_gram", EdgeNGramTokenizer.DEFAULT_MAX_GRAM_SIZE, builder.maxGram); + EdgeNGramTokenizerBuilder builder = assertBuilderAndTokenizer(json, + EdgeNGramTokenizerBuilder.class, + EdgeNGramTokenizer.class); + assertExactValue("EdgeNGramTokenizerBuilder.min_gram", + EdgeNGramTokenizer.DEFAULT_MIN_GRAM_SIZE, + builder.minGram); + assertExactValue("EdgeNGramTokenizerBuilder.max_gram", + EdgeNGramTokenizer.DEFAULT_MAX_GRAM_SIZE, + builder.maxGram); } @Test(expected = IOException.class) @@ -150,7 +194,9 @@ public void testEdgeNGramTokenizerInvalidJSON() throws IOException { @Test public void testPathHierarchyTokenizerValidJSON() { String json = "{type: \"path_hierarchy\", buffer_size: 246, delimiter: \"$\", replacement: \"%\", skip: 3}"; - PathHierarchyTokenizerBuilder builder = assertBuilderAndTokenizer(json, PathHierarchyTokenizerBuilder.class, PathHierarchyTokenizer.class); + PathHierarchyTokenizerBuilder builder = assertBuilderAndTokenizer(json, + PathHierarchyTokenizerBuilder.class, + PathHierarchyTokenizer.class); assertExactValue("PathHierarchyTokenizerBuilder.buffer_size", 246, builder.bufferSize); assertExactValue("PathHierarchyTokenizerBuilder.delimiter", '$', builder.delimiter); assertExactValue("PathHierarchyTokenizerBuilder.replacement", '%', builder.replacement); @@ -160,11 +206,21 @@ public void testPathHierarchyTokenizerValidJSON() { @Test public void testPathHierarchyTokenizerDefaultValues() { String json = "{type: \"path_hierarchy\"}"; - PathHierarchyTokenizerBuilder builder = assertBuilderAndTokenizer(json, PathHierarchyTokenizerBuilder.class, PathHierarchyTokenizer.class); - assertExactValue("PathHierarchyTokenizerBuilder.buffer_size", PathHierarchyTokenizerBuilder.DEFAULT_BUFFER_SIZE, builder.bufferSize); - assertExactValue("PathHierarchyTokenizerBuilder.delimiter", PathHierarchyTokenizerBuilder.DEFAULT_DELIMITER, builder.delimiter); - assertExactValue("PathHierarchyTokenizerBuilder.replacement", PathHierarchyTokenizerBuilder.DEFAULT_REPLACEMENT, builder.replacement); - assertExactValue("PathHierarchyTokenizerBuilder.skip", PathHierarchyTokenizerBuilder.DEFAULT_SKIP, builder.skip); + PathHierarchyTokenizerBuilder builder = assertBuilderAndTokenizer(json, + PathHierarchyTokenizerBuilder.class, + PathHierarchyTokenizer.class); + assertExactValue("PathHierarchyTokenizerBuilder.buffer_size", + PathHierarchyTokenizerBuilder.DEFAULT_BUFFER_SIZE, + builder.bufferSize); + assertExactValue("PathHierarchyTokenizerBuilder.delimiter", + PathHierarchyTokenizerBuilder.DEFAULT_DELIMITER, + builder.delimiter); + assertExactValue("PathHierarchyTokenizerBuilder.replacement", + PathHierarchyTokenizerBuilder.DEFAULT_REPLACEMENT, + builder.replacement); + assertExactValue("PathHierarchyTokenizerBuilder.skip", + PathHierarchyTokenizerBuilder.DEFAULT_SKIP, + builder.skip); } @Test(expected = IOException.class) @@ 
-175,7 +231,9 @@ public void testPathHierarchyTokenizerInvalidJSON() throws IOException { @Test public void testPatternTokenizerValidJSON() { String json = "{type: \"pattern\", pattern: \"[a-z]\", flags: 35, group: 0}"; - PatternTokenizerBuilder builder = assertBuilderAndTokenizer(json, PatternTokenizerBuilder.class, PatternTokenizer.class); + PatternTokenizerBuilder builder = assertBuilderAndTokenizer(json, + PatternTokenizerBuilder.class, + PatternTokenizer.class); assertExactValue("PathHierarchyTokenizerBuilder.pattern", "[a-z]", builder.pattern); assertExactValue("PathHierarchyTokenizerBuilder.flags", 35, builder.flags); assertExactValue("PathHierarchyTokenizerBuilder.group", 0, builder.group); @@ -184,8 +242,12 @@ public void testPatternTokenizerValidJSON() { @Test public void testPatternTokenizerDefaultValues() { String json = "{type: \"pattern\"}"; - PatternTokenizerBuilder builder = assertBuilderAndTokenizer(json, PatternTokenizerBuilder.class, PatternTokenizer.class); - assertExactValue("PathHierarchyTokenizerBuilder.pattern", PatternTokenizerBuilder.DEFAULT_PATTERN, builder.pattern); + PatternTokenizerBuilder builder = assertBuilderAndTokenizer(json, + PatternTokenizerBuilder.class, + PatternTokenizer.class); + assertExactValue("PathHierarchyTokenizerBuilder.pattern", + PatternTokenizerBuilder.DEFAULT_PATTERN, + builder.pattern); assertExactValue("PathHierarchyTokenizerBuilder.group", PatternTokenizerBuilder.DEFAULT_GROUP, builder.group); assertExactValue("PathHierarchyTokenizerBuilder.group", PatternTokenizerBuilder.DEFAULT_FLAGS, builder.flags); } @@ -197,8 +259,12 @@ public void testPatternTokenizerInvalidJSON() throws IOException { @Test public void testReversePathHierarchyTokenizerValidJSON() { - String json = "{type: \"reverse_path_hierarchy\", buffer_size: 246, delimiter: \"/\", replacement: \"%\", skip: 3}"; - ReversePathHierarchyTokenizerBuilder builder = assertBuilderAndTokenizer(json, ReversePathHierarchyTokenizerBuilder.class, ReversePathHierarchyTokenizer.class); + String + json + = "{type: \"reverse_path_hierarchy\", buffer_size: 246, delimiter: \"/\", replacement: \"%\", skip: 3}"; + ReversePathHierarchyTokenizerBuilder builder = assertBuilderAndTokenizer(json, + ReversePathHierarchyTokenizerBuilder.class, + ReversePathHierarchyTokenizer.class); assertExactValue("ReversePathHierarchyTokenizerBuilder.buffer_size", 246, builder.bufferSize); assertExactValue("ReversePathHierarchyTokenizerBuilder.delimiter", '/', builder.delimiter); assertExactValue("ReversePathHierarchyTokenizerBuilder.replacement", '%', builder.replacement); @@ -208,29 +274,46 @@ public void testReversePathHierarchyTokenizerValidJSON() { @Test public void testReversePathHierarchyTokenizerDefaultValues() { String json = "{type: \"reverse_path_hierarchy\"}"; - ReversePathHierarchyTokenizerBuilder builder = assertBuilderAndTokenizer(json, ReversePathHierarchyTokenizerBuilder.class, ReversePathHierarchyTokenizer.class); - assertExactValue("PathHierarchyTokenizerBuilder.buffer_size", ReversePathHierarchyTokenizerBuilder.DEFAULT_BUFFER_SIZE, builder.bufferSize); - assertExactValue("PathHierarchyTokenizerBuilder.delimiter", ReversePathHierarchyTokenizerBuilder.DEFAULT_DELIMITER, builder.delimiter); - assertExactValue("PathHierarchyTokenizerBuilder.replacement", ReversePathHierarchyTokenizerBuilder.DEFAULT_REPLACEMENT, builder.replacement); - assertExactValue("PathHierarchyTokenizerBuilder.skip", ReversePathHierarchyTokenizerBuilder.DEFAULT_SKIP, builder.skip); + ReversePathHierarchyTokenizerBuilder builder 
= assertBuilderAndTokenizer(json, + ReversePathHierarchyTokenizerBuilder.class, + ReversePathHierarchyTokenizer.class); + assertExactValue("PathHierarchyTokenizerBuilder.buffer_size", + ReversePathHierarchyTokenizerBuilder.DEFAULT_BUFFER_SIZE, + builder.bufferSize); + assertExactValue("PathHierarchyTokenizerBuilder.delimiter", + ReversePathHierarchyTokenizerBuilder.DEFAULT_DELIMITER, + builder.delimiter); + assertExactValue("PathHierarchyTokenizerBuilder.replacement", + ReversePathHierarchyTokenizerBuilder.DEFAULT_REPLACEMENT, + builder.replacement); + assertExactValue("PathHierarchyTokenizerBuilder.skip", + ReversePathHierarchyTokenizerBuilder.DEFAULT_SKIP, + builder.skip); } @Test(expected = IOException.class) public void testReversePathHierarchyTokenizerInvalidJSON() throws IOException { - assertJsonParseFail("{type: \"reverse_path_hierarchy\", buffer_size: 246, delimiter: \"/\", replacent: \"%\", skip: 3}"); + assertJsonParseFail( + "{type: \"reverse_path_hierarchy\", buffer_size: 246, delimiter: \"/\", replacent: \"%\", skip: 3}"); } @Test public void testStandardTokenizerValidJSON() { String json = "{type: \"standard\", max_token_length: 246}"; - StandardTokenizerBuilder builder = assertBuilderAndTokenizer(json, StandardTokenizerBuilder.class, StandardTokenizer.class); + StandardTokenizerBuilder builder = assertBuilderAndTokenizer(json, + StandardTokenizerBuilder.class, + StandardTokenizer.class); assertExactValue("StandardTokenizerBuilder.maxTokenLength", 246, builder.maxTokenLength); } @Test public void testStandardTokenizerDefaultValues() { - StandardTokenizerBuilder builder = assertBuilderAndTokenizer("{type: \"standard\"}", StandardTokenizerBuilder.class, StandardTokenizer.class); - assertExactValue("ClassicTokenizerBuilder.maxTokenLength", StandardTokenizerBuilder.DEFAULT_MAX_TOKEN_LENGTH, builder.maxTokenLength); + StandardTokenizerBuilder builder = assertBuilderAndTokenizer("{type: \"standard\"}", + StandardTokenizerBuilder.class, + StandardTokenizer.class); + assertExactValue("ClassicTokenizerBuilder.maxTokenLength", + StandardTokenizerBuilder.DEFAULT_MAX_TOKEN_LENGTH, + builder.maxTokenLength); } @Test(expected = IOException.class) @@ -241,15 +324,21 @@ public void testStandardTokenizerInvalidJSON() throws IOException { @Test public void testUAX29URLEmailTokenizerValidJSON() { String json = "{type: \"uax29_url_email\", max_token_length: 249}"; - UAX29URLEmailTokenizerBuilder builder = assertBuilderAndTokenizer(json, UAX29URLEmailTokenizerBuilder.class, UAX29URLEmailTokenizer.class); + UAX29URLEmailTokenizerBuilder builder = assertBuilderAndTokenizer(json, + UAX29URLEmailTokenizerBuilder.class, + UAX29URLEmailTokenizer.class); assertExactValue("UAX29URLEmailTokenizerBuilder.maxTokenLength", 249, builder.maxTokenLength); } @Test public void testUAX29URLEmailTokenizerDefaultValues() { String json = "{type: \"uax29_url_email\"}"; - UAX29URLEmailTokenizerBuilder builder = assertBuilderAndTokenizer(json, UAX29URLEmailTokenizerBuilder.class, UAX29URLEmailTokenizer.class); - assertExactValue("UAX29URLEmailTokenizerBuilder.maxTokenLength", UAX29URLEmailTokenizerBuilder.DEFAULT_MAX_TOKEN_LENGTH, builder.maxTokenLength); + UAX29URLEmailTokenizerBuilder builder = assertBuilderAndTokenizer(json, + UAX29URLEmailTokenizerBuilder.class, + UAX29URLEmailTokenizer.class); + assertExactValue("UAX29URLEmailTokenizerBuilder.maxTokenLength", + UAX29URLEmailTokenizerBuilder.DEFAULT_MAX_TOKEN_LENGTH, + builder.maxTokenLength); } @Test(expected = IOException.class) @@ -272,16 +361,26 @@ public void 
testWhitespaceTokenizerValidJSON() { @Test public void testWikipediaTokenizerValidJSON() { String json = "{type: \"wikipedia\", token_output: \"TOKENS_ONLY\", untokenized_types : [\"aaa\",\"bbb\"]}"; - WikipediaTokenizerBuilder builder = assertBuilderAndTokenizer(json, WikipediaTokenizerBuilder.class, WikipediaTokenizer.class); - assertExactValue("WikipediaTokenizerBuilder.token_output", WikipediaTokenizerBuilder.TokenOutputValue.TOKENS_ONLY, builder.tokenOutput); - assertExactValue("WikipediaTokenizerBuilder.untokenized_types", Sets.newHashSet("aaa", "bbb"), builder.untokenizedTypes); + WikipediaTokenizerBuilder builder = assertBuilderAndTokenizer(json, + WikipediaTokenizerBuilder.class, + WikipediaTokenizer.class); + assertExactValue("WikipediaTokenizerBuilder.token_output", + WikipediaTokenizerBuilder.TokenOutputValue.TOKENS_ONLY, + builder.tokenOutput); + assertExactValue("WikipediaTokenizerBuilder.untokenized_types", + Sets.newHashSet("aaa", "bbb"), + builder.untokenizedTypes); } @Test public void testWikipediaTokenizerDefaultValues() { String json = "{type: \"wikipedia\"}"; - WikipediaTokenizerBuilder builder = assertBuilderAndTokenizer(json, WikipediaTokenizerBuilder.class, WikipediaTokenizer.class); - assertExactValue("WikipediaTokenizerBuilder.token_output", WikipediaTokenizerBuilder.TokenOutputValue.TOKENS_ONLY, builder.tokenOutput); + WikipediaTokenizerBuilder builder = assertBuilderAndTokenizer(json, + WikipediaTokenizerBuilder.class, + WikipediaTokenizer.class); + assertExactValue("WikipediaTokenizerBuilder.token_output", + WikipediaTokenizerBuilder.TokenOutputValue.TOKENS_ONLY, + builder.tokenOutput); assertExactValue("WikipediaTokenizerBuilder.untokenized_types", Sets.newHashSet(), builder.untokenizedTypes); } @@ -294,4 +393,5 @@ public void testWikipediaTokenizerInvalidJSON() throws IOException { public void testInvalidTokenizerType() throws IOException { assertJsonParseFail("{type: \"invalid_type\"}"); } + } From 491987b0dfc42ca0aeee86ee3eb446fc95054fb1 Mon Sep 17 00:00:00 2001 From: jpgilaberte Date: Tue, 16 May 2017 14:10:21 +0200 Subject: [PATCH 05/40] Scala refactor in tokenizer feature --- .../cassandra/lucene/builder/Builder.java | 12 + .../index/schema/analysis/Analyzer.java | 3 +- .../index/schema/analysis/CustomAnalyzer.java | 28 + .../analysis/tokenizer/ClassicTokenizer.java | 8 + .../tokenizer/EdgeNGramTokenizer.java | 9 + .../analysis/tokenizer/KeywordTokenizer.java | 5 + .../analysis/tokenizer/NGramTokenizer.java | 10 + .../tokenizer/PathHierarchyTokenizer.java | 11 + .../analysis/tokenizer/PatternTokenizer.java | 10 + .../ReversePathHierarchyTokenizer.java | 12 + .../analysis/tokenizer/StandardTokenizer.java | 9 + .../tokenizer/UAX29URLEmailTokenizer.java | 10 + .../tokenizer/WikipediaTokenizer.java | 10 + plugin/pom.xml | 5 + .../schema/analysis/CustomAnalyzer.java | 5 +- .../analysis/CustomAnalyzerBuilder.java | 3 +- .../tokenizer/ClassicTokenizerBuilder.java | 36 - .../tokenizer/EdgeNGramTokenizerBuilder.java | 43 -- .../tokenizer/KeywordTokenizerBuilder.java | 34 - .../tokenizer/LetterTokenizerBuilder.java | 23 - .../tokenizer/LowerCaseTokenizerBuilder.java | 24 - .../tokenizer/NGramTokenizerBuilder.java | 42 -- .../PathHierarchyTokenizerBuilder.java | 59 -- .../tokenizer/PatternTokenizerBuilder.java | 51 -- .../ReversePathHierarchyTokenizerBuilder.java | 58 -- .../tokenizer/StandardTokenizerBuilder.java | 35 - .../tokenizer/ThaiTokenizerBuilder.java | 24 - .../analysis/tokenizer/TokenizerBuilder.java | 47 -- .../UAX29URLEmailTokenizerBuilder.java | 35 - 
.../UnicodeWhitespaceTokenizerBuilder.java | 16 - .../tokenizer/WhitespaceTokenizerBuilder.java | 24 - .../tokenizer/WikipediaTokenizerBuilder.java | 81 -- .../tokenizer/ClassicTokenizerBuilder.scala | 44 ++ .../tokenizer/EdgeNGramTokenizerBuilder.scala | 29 + .../tokenizer/KeywordTokenizerBuilder.scala | 24 + .../tokenizer/LetterTokenizerBuilder.scala | 19 + .../tokenizer/LowerCaseTokenizerBuilder.scala | 18 + .../tokenizer/NGramTokenizerBuilder.scala | 28 + .../PathHierarchyTokenizerBuilder.scala | 34 + .../tokenizer/PatternTokenizerBuilder.scala | 33 + ...ReversePathHierarchyTokenizerBuilder.scala | 34 + .../tokenizer/StandardTokenizerBuilder.scala | 26 + .../tokenizer/ThaiTokenizerBuilder.scala | 17 + .../analysis/tokenizer/TokenizerBuilder.scala | 60 ++ .../UAX29URLEmailTokenizerBuilder.scala | 26 + .../UnicodeWhitespaceTokenizerBuilder.scala | 17 + .../WhitespaceTokenizerBuilder.scala | 17 + .../tokenizer/WikipediaTokenizerBuilder.scala | 37 + .../tokenizer/TokenizerBuilderTest.java | 397 ---------- .../tokenizer/TokenizerBuilderTest.scala | 248 +++++++ .../tokenizer/TokenizerBuilderIT.java | 695 ++++++++++++++++++ 51 files changed, 1550 insertions(+), 1035 deletions(-) create mode 100644 builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/CustomAnalyzer.java create mode 100644 plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/ClassicTokenizerBuilder.scala create mode 100644 plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/EdgeNGramTokenizerBuilder.scala create mode 100644 plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/KeywordTokenizerBuilder.scala create mode 100644 plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/LetterTokenizerBuilder.scala create mode 100644 plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/LowerCaseTokenizerBuilder.scala create mode 100644 plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/NGramTokenizerBuilder.scala create mode 100644 plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/PathHierarchyTokenizerBuilder.scala create mode 100644 plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/PatternTokenizerBuilder.scala create mode 100644 plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/ReversePathHierarchyTokenizerBuilder.scala create mode 100644 plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/StandardTokenizerBuilder.scala create mode 100644 plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/ThaiTokenizerBuilder.scala create mode 100644 plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/TokenizerBuilder.scala create mode 100644 plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/UAX29URLEmailTokenizerBuilder.scala create mode 100644 plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/UnicodeWhitespaceTokenizerBuilder.scala create mode 100644 plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/WhitespaceTokenizerBuilder.scala create mode 100644 plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/WikipediaTokenizerBuilder.scala delete mode 100644 plugin/src/test/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/TokenizerBuilderTest.java create mode 100644 
plugin/src/test/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/TokenizerBuilderTest.scala
 create mode 100644 testsAT/src/test/java/com/stratio/cassandra/lucene/testsAT/schema/analysis/tokenizer/TokenizerBuilderIT.java

diff --git a/builder/src/main/java/com/stratio/cassandra/lucene/builder/Builder.java b/builder/src/main/java/com/stratio/cassandra/lucene/builder/Builder.java
index f598c9690..7ff7134df 100644
--- a/builder/src/main/java/com/stratio/cassandra/lucene/builder/Builder.java
+++ b/builder/src/main/java/com/stratio/cassandra/lucene/builder/Builder.java
@@ -21,7 +21,9 @@
 import com.stratio.cassandra.lucene.builder.index.Partitioner;
 import com.stratio.cassandra.lucene.builder.index.schema.Schema;
 import com.stratio.cassandra.lucene.builder.index.schema.analysis.ClasspathAnalyzer;
+import com.stratio.cassandra.lucene.builder.index.schema.analysis.CustomAnalyzer;
 import com.stratio.cassandra.lucene.builder.index.schema.analysis.SnowballAnalyzer;
+import com.stratio.cassandra.lucene.builder.index.schema.analysis.tokenizer.Tokenizer;
 import com.stratio.cassandra.lucene.builder.index.schema.mapping.*;
 import com.stratio.cassandra.lucene.builder.search.Search;
 import com.stratio.cassandra.lucene.builder.search.condition.*;
@@ -253,6 +255,16 @@ public static SnowballAnalyzer snowballAnalyzer(String language) {
         return new SnowballAnalyzer(language);
     }
 
+    /**
+     * Returns a new {@link CustomAnalyzer} using the specified tokenizer.
+     *
+     * @param tokenizer the tokenizer to be used
+     * @return a new custom analyzer
+     */
+    public static CustomAnalyzer customAnalyzer(Tokenizer tokenizer) {
+        return new CustomAnalyzer(tokenizer);
+    }
+
     /**
      * Returns a new {@link Search}.
      *
diff --git a/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/Analyzer.java b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/Analyzer.java
index 196c6ed1a..04c0ca4e7 100644
--- a/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/Analyzer.java
+++ b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/Analyzer.java
@@ -26,6 +26,7 @@
  */
 @JsonTypeInfo(use = JsonTypeInfo.Id.NAME, include = JsonTypeInfo.As.PROPERTY, property = "type")
 @JsonSubTypes({@JsonSubTypes.Type(value = ClasspathAnalyzer.class, name = "classpath"),
-               @JsonSubTypes.Type(value = SnowballAnalyzer.class, name = "snowball")})
+               @JsonSubTypes.Type(value = SnowballAnalyzer.class, name = "snowball"),
+               @JsonSubTypes.Type(value = CustomAnalyzer.class, name = "custom")})
 public abstract class Analyzer extends JSONBuilder {
 }
diff --git a/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/CustomAnalyzer.java b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/CustomAnalyzer.java
new file mode 100644
index 000000000..8056b3d02
--- /dev/null
+++ b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/CustomAnalyzer.java
@@ -0,0 +1,28 @@
+package com.stratio.cassandra.lucene.builder.index.schema.analysis;
+
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.stratio.cassandra.lucene.builder.index.schema.analysis.tokenizer.Tokenizer;
+
+/**
+ * {@link Analyzer} built from a configurable {@link Tokenizer}.
+ *
+ * @author Juan Pedro Gilaberte {@literal }
+ */
+public class CustomAnalyzer extends Analyzer {
+
+    @JsonProperty("tokenizer")
+    private final Tokenizer tokenizer;
+
+    /**
+     * Builds a new {@link CustomAnalyzer} using the specified tokenizer.
+     *
+     * @param tokenizer the {@link Tokenizer} to be used
+     */
+    @JsonCreator
+    public CustomAnalyzer(@JsonProperty("tokenizer") Tokenizer tokenizer) {
+        this.tokenizer = tokenizer;
+    }
+}
diff --git a/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/ClassicTokenizer.java b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/ClassicTokenizer.java
index a73383193..89ea9ae93 100644
--- a/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/ClassicTokenizer.java
+++ b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/ClassicTokenizer.java
@@ -33,6 +33,14 @@ public class ClassicTokenizer extends Tokenizer {
     @JsonProperty("max_token_length")
     final Integer maxTokenLength;
 
+    /**
+     * Builds a new {@link ClassicTokenizer} using the default maxTokenLength.
+     */
+    @JsonCreator
+    public ClassicTokenizer() {
+        this.maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;
+    }
+
     /**
      * Builds a new {@link ClassicTokenizer} using the specified maxTokenLength.
      *
diff --git a/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/EdgeNGramTokenizer.java b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/EdgeNGramTokenizer.java
index 1eec2ec05..cd85e0bc1 100644
--- a/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/EdgeNGramTokenizer.java
+++ b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/EdgeNGramTokenizer.java
@@ -37,6 +37,15 @@ public class EdgeNGramTokenizer extends Tokenizer {
     @JsonProperty("max_gram")
     final Integer maxGram;
 
+    /**
+     * Builds a new {@link EdgeNGramTokenizer} using the default minGram and maxGram.
+     */
+    @JsonCreator
+    public EdgeNGramTokenizer() {
+        this.minGram = DEFAULT_MIN_GRAM;
+        this.maxGram = DEFAULT_MAX_GRAM;
+    }
+
     /**
      * Builds a new {@link EdgeNGramTokenizer} using the specified minGram and maxGram.
      *
diff --git a/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/KeywordTokenizer.java b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/KeywordTokenizer.java
index 263bf9169..dc4e563d8 100644
--- a/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/KeywordTokenizer.java
+++ b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/KeywordTokenizer.java
@@ -33,6 +33,11 @@ public class KeywordTokenizer extends Tokenizer {
     @JsonProperty("buffer_size")
     final Integer bufferSize;
 
+    /** Builds a new {@link KeywordTokenizer} using the default bufferSize. */
+    @JsonCreator
+    public KeywordTokenizer() {
+        this.bufferSize = DEFAULT_BUFFER_SIZE;
+    }
+
     /**
      * Builds a new {@link KeywordTokenizer} using the specified buffer_size.
      *
diff --git a/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/NGramTokenizer.java b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/NGramTokenizer.java
index f10abf600..004e44c09 100644
--- a/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/NGramTokenizer.java
+++ b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/NGramTokenizer.java
@@ -38,6 +38,16 @@ public class NGramTokenizer extends Tokenizer {
     @JsonProperty("max_gram")
     final Integer maxGram;
 
+    /**
+     * Builds a new {@link NGramTokenizer} using the default minGram and maxGram.
+     */
+    @JsonCreator
+    public NGramTokenizer() {
+        this.minGram = DEFAULT_MIN_GRAM;
+        this.maxGram = DEFAULT_MAX_GRAM;
+    }
+
     /**
      * Builds a new {@link NGramTokenizer} using the specified minGram and maxGram.
      *
diff --git a/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/PathHierarchyTokenizer.java b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/PathHierarchyTokenizer.java
index bbb3419fe..67c4f26ae 100644
--- a/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/PathHierarchyTokenizer.java
+++ b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/PathHierarchyTokenizer.java
@@ -50,6 +50,17 @@ public class PathHierarchyTokenizer extends Tokenizer {
 
     /**
-     * Builds a new {@link PathHierarchyTokenizer} using the specified bufferSize, delimiter, replacement and skip.
+     * Builds a new {@link PathHierarchyTokenizer} using the default bufferSize, delimiter, replacement and skip.
+     */
+    @JsonCreator
+    public PathHierarchyTokenizer() {
+        this.bufferSize = DEFAULT_BUFFER_SIZE;
+        this.delimiter = DEFAULT_DELIMITER;
+        this.replacement = DEFAULT_REPLACEMENT;
+        this.skip = DEFAULT_SKIP;
+    }
+
+    /**
+     * Builds a new {@link PathHierarchyTokenizer} using the specified bufferSize, delimiter, replacement and skip.
      *
      * @param bufferSize terms cache read buffer size
      * @param delimiter path separator
diff --git a/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/PatternTokenizer.java b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/PatternTokenizer.java
index 28723c857..d8d1fc7ca 100644
--- a/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/PatternTokenizer.java
+++ b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/PatternTokenizer.java
@@ -43,6 +43,16 @@ public class PatternTokenizer extends Tokenizer {
     @JsonProperty("group")
     final Integer group;
 
+    /**
+     * Builds a new {@link PatternTokenizer} using the default pattern, flags, and group.
+     */
+    @JsonCreator
+    public PatternTokenizer() {
+        this.pattern = DEFAULT_PATTERN;
+        this.flags = DEFAULT_FLAGS;
+        this.group = DEFAULT_GROUP;
+    }
+
     /**
      * Builds a new {@link PatternTokenizer} using the specified pattern, flags, and group.
      *
diff --git a/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/ReversePathHierarchyTokenizer.java b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/ReversePathHierarchyTokenizer.java
index 5d0652fca..cfb1fe051 100644
--- a/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/ReversePathHierarchyTokenizer.java
+++ b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/ReversePathHierarchyTokenizer.java
@@ -48,6 +48,18 @@ public class ReversePathHierarchyTokenizer extends Tokenizer {
     @JsonProperty("skip")
     final Integer skip;
 
+    /**
+     * Builds a new {@link ReversePathHierarchyTokenizer} using the default bufferSize, delimiter, replacement and
+     * skip.
+     */
+    @JsonCreator
+    public ReversePathHierarchyTokenizer() {
+        this.bufferSize = DEFAULT_BUFFER_SIZE;
+        this.delimiter = DEFAULT_DELIMITER;
+        this.replacement = DEFAULT_REPLACEMENT;
+        this.skip = DEFAULT_SKIP;
+    }
+
     /**
      * Builds a new {@link ReversePathHierarchyTokenizer} using the specified bufferSize, delimiter, replacement and
      * skip.
diff --git a/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/StandardTokenizer.java b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/StandardTokenizer.java
index cbdd65205..5cfe6569a 100644
--- a/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/StandardTokenizer.java
+++ b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/StandardTokenizer.java
@@ -33,6 +33,15 @@ public class StandardTokenizer extends Tokenizer {
     @JsonProperty("max_token_length")
     final Integer maxTokenLength;
 
+    /**
+     * Builds a new {@link StandardTokenizer} using the default maxTokenLength.
+     */
+    @JsonCreator
+    public StandardTokenizer() {
+        this.maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;
+    }
+
     /**
-     * Builds a new {@link StandardTokenizer} using the specified bufferSize, delimiter, replacement and skip.
+     * Builds a new {@link StandardTokenizer} using the specified maxTokenLength.
      *
diff --git a/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/UAX29URLEmailTokenizer.java b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/UAX29URLEmailTokenizer.java
index e9d485faf..349b0fe8b 100644
--- a/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/UAX29URLEmailTokenizer.java
+++ b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/UAX29URLEmailTokenizer.java
@@ -33,6 +33,16 @@ public class UAX29URLEmailTokenizer extends Tokenizer {
     @JsonProperty("max_token_length")
     final Integer maxTokenLength;
 
+    /**
+     * Builds a new {@link UAX29URLEmailTokenizer} using the default maxTokenLength.
+     */
+    @JsonCreator
+    public UAX29URLEmailTokenizer() {
+        this.maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;
+    }
+
     /**
      * Builds a new {@link UAX29URLEmailTokenizer} using the specified maxTokenLength.
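For context on why every tokenizer now gets a no-argument @JsonCreator: it makes each attribute optional in the schema JSON and keeps the Java builder API terse. A hypothetical usage sketch of the customAnalyzer(...) factory added to Builder.java above (the printed JSON shape is an assumption; the exact rendering depends on the builder's JSONBuilder serialization settings):

    import static com.stratio.cassandra.lucene.builder.Builder.customAnalyzer;

    import com.stratio.cassandra.lucene.builder.index.schema.analysis.CustomAnalyzer;
    import com.stratio.cassandra.lucene.builder.index.schema.analysis.tokenizer.StandardTokenizer;

    public class CustomAnalyzerSketch {
        public static void main(String[] args) {
            // StandardTokenizer() relies on the default constructor added in this patch
            CustomAnalyzer analyzer = customAnalyzer(new StandardTokenizer());
            // Expected to serialize to something like:
            // {"type":"custom","tokenizer":{"type":"standard","max_token_length":255}}
            System.out.println(analyzer);
        }
    }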
* diff --git a/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/WikipediaTokenizer.java b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/WikipediaTokenizer.java index 78792d88d..3f28715b1 100644 --- a/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/WikipediaTokenizer.java +++ b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/WikipediaTokenizer.java @@ -80,6 +80,16 @@ public String getStringValue() { @JsonProperty("untokenized_types") final Set untokenizedTypes; + /** + * Builds a new {@link WikipediaTokenizer} using the default tokenOutput and untokenizedTypes. + * + */ + @JsonCreator + public WikipediaTokenizer() { + this.tokenOutput = DEFAULT_TOKEN_OUTPUT; + this.untokenizedTypes = DEFAULT_UNTOKENIZED_TYPES; + } + /** * Builds a new {@link WikipediaTokenizer} using the specified tokenOutput and untokenizedTypes. * diff --git a/plugin/pom.xml b/plugin/pom.xml index 92caebfdb..95603a461 100644 --- a/plugin/pom.xml +++ b/plugin/pom.xml @@ -97,6 +97,11 @@ jackson-databind ${jackson.version} + + com.fasterxml.jackson.module + jackson-module-scala_2.12 + 2.8.8 + org.mockito mockito-all diff --git a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/CustomAnalyzer.java b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/CustomAnalyzer.java index 8f6dda451..fb9bc2e3b 100644 --- a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/CustomAnalyzer.java +++ b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/CustomAnalyzer.java @@ -20,10 +20,7 @@ public CustomAnalyzer(Tokenizer tokenizer) { @Override protected Analyzer.TokenStreamComponents createComponents(String fieldName) { - - TokenStream ts = tokenizer; - - return new TokenStreamComponents(tokenizer, ts); + return new TokenStreamComponents(tokenizer); } diff --git a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/CustomAnalyzerBuilder.java b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/CustomAnalyzerBuilder.java index 9bee8b3ae..74bc236e1 100644 --- a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/CustomAnalyzerBuilder.java +++ b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/CustomAnalyzerBuilder.java @@ -23,6 +23,7 @@ import com.fasterxml.jackson.annotation.JsonProperty; import com.stratio.cassandra.lucene.schema.analysis.tokenizer.TokenizerBuilder; import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.Tokenizer; /** * {@link AnalyzerBuilder} for building {@link Analyzer}s based on an advanced configuration. @@ -45,7 +46,7 @@ public CustomAnalyzerBuilder(@JsonProperty("tokenizer") TokenizerBuilder tokeniz /** {@inheritDoc} */ @Override public Analyzer analyzer() { - return new CustomAnalyzer(tokenizer.buildTokenizer()); + return new CustomAnalyzer((Tokenizer)tokenizer.buildTokenizer()); } } diff --git a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/ClassicTokenizerBuilder.java b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/ClassicTokenizerBuilder.java index ebf5855c4..18edfcd00 100644 --- a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/ClassicTokenizerBuilder.java +++ b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/ClassicTokenizerBuilder.java @@ -14,39 +14,3 @@ * limitations under the License. 
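A side note on the new jackson-module-scala dependency in the plugin POM: Jackson only uses it once the module is registered on the ObjectMapper. A hedged sketch of that wiring (the project's actual mapper setup is outside this hunk, and the Java spelling of the Scala singleton may vary across versions):

    import com.fasterxml.jackson.databind.ObjectMapper;
    import com.fasterxml.jackson.module.scala.DefaultScalaModule$;

    public class ScalaModuleWiring {
        public static void main(String[] args) {
            ObjectMapper mapper = new ObjectMapper();
            // Required so Jackson can bind the Scala tokenizer builders that
            // replace the deleted Java builders in this commit
            mapper.registerModule(DefaultScalaModule$.MODULE$);
        }
    }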
*/ package com.stratio.cassandra.lucene.schema.analysis.tokenizer; - -import com.fasterxml.jackson.annotation.JsonCreator; -import com.fasterxml.jackson.annotation.JsonProperty; -import org.apache.lucene.analysis.standard.ClassicTokenizer; - -/** - * A {@link TokenizerBuilder} for building {@link org.apache.lucene.analysis.standard.ClassicTokenizer} - * - * @author Eduardo Alonso {@literal } - */ -public class ClassicTokenizerBuilder extends TokenizerBuilder { - - static final Integer DEFAULT_MAX_TOKEN_LENGTH = 255; - - /** If a token length is bigger that this, token is split at max token length intervals. */ - @JsonProperty("max_token_length") - final Integer maxTokenLength; - - /** - * Builds a new {@link ClassicTokenizerBuilder} using the specified maxTokenLength. - * - * @param maxTokenLength if a token length is bigger that this, token is split at max token length intervals. - */ - @JsonCreator - public ClassicTokenizerBuilder(@JsonProperty("max_token_length") Integer maxTokenLength) { - this.maxTokenLength = getOrDefault(maxTokenLength, DEFAULT_MAX_TOKEN_LENGTH); - } - - /** {@inheritDoc} */ - @Override - public ClassicTokenizer buildTokenizer() { - ClassicTokenizer tokenizer = new ClassicTokenizer(); - tokenizer.setMaxTokenLength(maxTokenLength); - return tokenizer; - } -} diff --git a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/EdgeNGramTokenizerBuilder.java b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/EdgeNGramTokenizerBuilder.java index 572bb2afa..18edfcd00 100644 --- a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/EdgeNGramTokenizerBuilder.java +++ b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/EdgeNGramTokenizerBuilder.java @@ -14,46 +14,3 @@ * limitations under the License. */ package com.stratio.cassandra.lucene.schema.analysis.tokenizer; - -import com.fasterxml.jackson.annotation.JsonCreator; -import com.fasterxml.jackson.annotation.JsonProperty; -import org.apache.lucene.analysis.ngram.EdgeNGramTokenizer; - -/** - * A {@link EdgeNGramTokenizerBuilder} for building {@link org.apache.lucene.analysis.ngram.EdgeNGramTokenizer} - * - * @author Juan Pedro Gilaberte {@literal } - */ -public class EdgeNGramTokenizerBuilder extends TokenizerBuilder { - - static final Integer DEFAULT_MIN_GRAM = 1; - static final Integer DEFAULT_MAX_GRAM = 1; - - /** the smallest n-gram to generate */ - @JsonProperty("min_gram") - final Integer minGram; - - /** the largest n-gram to generate */ - @JsonProperty("max_gram") - final Integer maxGram; - - /** - * Builds a new {@link EdgeNGramTokenizerBuilder} using the specified minGram and manGram. 
- * - * @param minGram the smallest n-gram to generate - * @param minGram the largest n-gram to generate - */ - @JsonCreator - public EdgeNGramTokenizerBuilder(@JsonProperty("min_gram") Integer minGram, - @JsonProperty("max_gram") Integer maxGram) { - this.minGram = getOrDefault(minGram, DEFAULT_MIN_GRAM); - this.maxGram = getOrDefault(maxGram, DEFAULT_MAX_GRAM); - } - - /** {@inheritDoc} */ - @Override - public EdgeNGramTokenizer buildTokenizer() { - return new EdgeNGramTokenizer(minGram, maxGram); - } - -} diff --git a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/KeywordTokenizerBuilder.java b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/KeywordTokenizerBuilder.java index 687ba9c2b..18edfcd00 100644 --- a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/KeywordTokenizerBuilder.java +++ b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/KeywordTokenizerBuilder.java @@ -14,37 +14,3 @@ * limitations under the License. */ package com.stratio.cassandra.lucene.schema.analysis.tokenizer; - -import com.fasterxml.jackson.annotation.JsonCreator; -import com.fasterxml.jackson.annotation.JsonProperty; -import org.apache.lucene.analysis.core.KeywordTokenizer; - -/** - * A {@link TokenizerBuilder} for building {@link org.apache.lucene.analysis.core.KeywordTokenizer} - * - * @author Eduardo Alonso {@literal } - */ -public class KeywordTokenizerBuilder extends TokenizerBuilder { - - static final Integer DEFAULT_BUFFER_SIZE = 256; - - /** terms cache read buffer size */ - @JsonProperty("buffer_size") - final Integer bufferSize; - - /** - * Builds a new {@link KeywordTokenizerBuilder} using the specified buffer_size. - * - * @param bufferSize the terms cache read buffer size - */ - @JsonCreator - public KeywordTokenizerBuilder(@JsonProperty("buffer_size") Integer bufferSize) { - this.bufferSize = getOrDefault(bufferSize, DEFAULT_BUFFER_SIZE); - } - - /** {@inheritDoc} */ - @Override - public KeywordTokenizer buildTokenizer() { - return new KeywordTokenizer(bufferSize); - } -} diff --git a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/LetterTokenizerBuilder.java b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/LetterTokenizerBuilder.java index ff82a1c24..9f3308381 100644 --- a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/LetterTokenizerBuilder.java +++ b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/LetterTokenizerBuilder.java @@ -15,26 +15,3 @@ */ package com.stratio.cassandra.lucene.schema.analysis.tokenizer; -import com.fasterxml.jackson.annotation.JsonCreator; -import org.apache.lucene.analysis.core.LetterTokenizer; - -/** - * A {@link TokenizerBuilder} for building {@link org.apache.lucene.analysis.core.LetterTokenizer} - * - * @author Eduardo Alonso {@literal } - */ -public class LetterTokenizerBuilder extends TokenizerBuilder { - - /** - * Builds a new {@link LetterTokenizer}. 
- */ - @JsonCreator - public LetterTokenizerBuilder() { - } - - /** {@inheritDoc} */ - @Override - public LetterTokenizer buildTokenizer() { - return new LetterTokenizer(); - } -} diff --git a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/LowerCaseTokenizerBuilder.java b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/LowerCaseTokenizerBuilder.java index bf0959608..18edfcd00 100644 --- a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/LowerCaseTokenizerBuilder.java +++ b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/LowerCaseTokenizerBuilder.java @@ -14,27 +14,3 @@ * limitations under the License. */ package com.stratio.cassandra.lucene.schema.analysis.tokenizer; - -import com.fasterxml.jackson.annotation.JsonCreator; -import org.apache.lucene.analysis.core.LowerCaseTokenizer; - -/** - * A {@link LowerCaseTokenizerBuilder} for building {@link org.apache.lucene.analysis.core.LowerCaseTokenizer} - * - * @author Juan Pedro Gilaberte {@literal } - */ -public class LowerCaseTokenizerBuilder extends TokenizerBuilder { - - /** - * Builds a new {@link LowerCaseTokenizerBuilder} - */ - @JsonCreator - public LowerCaseTokenizerBuilder() { - } - - /** {@inheritDoc} */ - @Override - public LowerCaseTokenizer buildTokenizer() { - return new LowerCaseTokenizer(); - } -} diff --git a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/NGramTokenizerBuilder.java b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/NGramTokenizerBuilder.java index 4da9dd300..18edfcd00 100644 --- a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/NGramTokenizerBuilder.java +++ b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/NGramTokenizerBuilder.java @@ -14,45 +14,3 @@ * limitations under the License. */ package com.stratio.cassandra.lucene.schema.analysis.tokenizer; - -import com.fasterxml.jackson.annotation.JsonCreator; -import com.fasterxml.jackson.annotation.JsonProperty; -import org.apache.lucene.analysis.ngram.NGramTokenizer; - -/** - * A {@link TokenizerBuilder} for building {@link org.apache.lucene.analysis.ngram.NGramTokenizer} - * - * @author Eduardo Alonso {@literal } - */ -public class NGramTokenizerBuilder extends TokenizerBuilder { - - static final Integer DEFAULT_MIN_GRAM = 1; - static final Integer DEFAULT_MAX_GRAM = 2; - - /** the smallest n-gram to generate */ - @JsonProperty("min_gram") - final Integer minGram; - - /** the largest n-gram to generate */ - @JsonProperty("max_gram") - final Integer maxGram; - - /** - * Builds a new {@link NGramTokenizerBuilder} using the specified minGram and manGram. 
- * - * @param minGram the smallest n-gram to generate - * @param minGram the largest n-gram to generate - */ - @JsonCreator - public NGramTokenizerBuilder(@JsonProperty("min_gram") Integer minGram, - @JsonProperty("max_gram") Integer maxGram) { - this.minGram = getOrDefault(minGram, DEFAULT_MIN_GRAM); - this.maxGram = getOrDefault(maxGram, DEFAULT_MAX_GRAM); - } - - /** {@inheritDoc} */ - @Override - public NGramTokenizer buildTokenizer() { - return new NGramTokenizer(minGram, maxGram); - } -} diff --git a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/PathHierarchyTokenizerBuilder.java b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/PathHierarchyTokenizerBuilder.java index 59c8169d4..18edfcd00 100644 --- a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/PathHierarchyTokenizerBuilder.java +++ b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/PathHierarchyTokenizerBuilder.java @@ -14,62 +14,3 @@ * limitations under the License. */ package com.stratio.cassandra.lucene.schema.analysis.tokenizer; - -import com.fasterxml.jackson.annotation.JsonCreator; -import com.fasterxml.jackson.annotation.JsonProperty; -import org.apache.lucene.analysis.path.PathHierarchyTokenizer; - -/** - * A {@link TokenizerBuilder} for building {@link org.apache.lucene.analysis.path.PathHierarchyTokenizer} - * - * @author Eduardo Alonso {@literal } - */ -public class PathHierarchyTokenizerBuilder extends TokenizerBuilder { - - static final Integer DEFAULT_BUFFER_SIZE = 1024; - static final Character DEFAULT_DELIMITER = '/'; - static final Character DEFAULT_REPLACEMENT = '/'; - static final Integer DEFAULT_SKIP = 0; - - /** terms cache read buffer size */ - @JsonProperty("buffer_size") - final Integer bufferSize; - - /** path separator */ - @JsonProperty("delimiter") - final Character delimiter; - - /** a replacement character for delimiter */ - @JsonProperty("replacement") - final Character replacement; - - /** number of initial tokens to skip */ - @JsonProperty("skip") - final Integer skip; - - /** - * Builds a new {@link PathHierarchyTokenizerBuilder} using the specified bufferSize, delimiter, replacement and - * skip. 
- * - * @param bufferSize terms cache read buffer size - * @param delimiter path separator - * @param replacement a replacement character for delimiter - * @param skip number of initial tokens to skip - */ - @JsonCreator - public PathHierarchyTokenizerBuilder(@JsonProperty("buffer_size") Integer bufferSize, - @JsonProperty("delimiter") Character delimiter, - @JsonProperty("replacement") Character replacement, - @JsonProperty("skip") Integer skip) { - this.bufferSize = getOrDefault(bufferSize, DEFAULT_BUFFER_SIZE); - this.delimiter = getOrDefault(delimiter, DEFAULT_DELIMITER); - this.replacement = getOrDefault(replacement, DEFAULT_REPLACEMENT); - this.skip = getOrDefault(skip, DEFAULT_SKIP); - } - - /** {@inheritDoc} */ - @Override - public PathHierarchyTokenizer buildTokenizer() { - return new PathHierarchyTokenizer(bufferSize, delimiter, replacement, skip); - } -} diff --git a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/PatternTokenizerBuilder.java b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/PatternTokenizerBuilder.java index ca650a96f..9f3308381 100644 --- a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/PatternTokenizerBuilder.java +++ b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/PatternTokenizerBuilder.java @@ -15,54 +15,3 @@ */ package com.stratio.cassandra.lucene.schema.analysis.tokenizer; -import com.fasterxml.jackson.annotation.JsonCreator; -import com.fasterxml.jackson.annotation.JsonProperty; -import org.apache.lucene.analysis.pattern.PatternTokenizer; - -import java.util.regex.Pattern; - -/** - * A {@link TokenizerBuilder} for building {@link org.apache.lucene.analysis.pattern.PatternTokenizer} - * - * @author Eduardo Alonso {@literal } - */ -public class PatternTokenizerBuilder extends TokenizerBuilder { - - static final String DEFAULT_PATTERN = "\\W+"; - static final Integer DEFAULT_FLAGS = 0; - static final Integer DEFAULT_GROUP = -1; - - /** java regular expression */ - @JsonProperty("pattern") - final String pattern; - - /** java regular expression flags */ - @JsonProperty("flags") - final Integer flags; - - /** which pattern group to use to generate tokens (-1 for split) */ - @JsonProperty("group") - final Integer group; - - /** - * Builds a new {@link PatternTokenizerBuilder} using the specified pattern, flags, and group. 
- * - * @param pattern java regular expression - * @param flags java regular expression flags - * @param group a pattern group to use to generate tokens (-1 for split) - */ - @JsonCreator - public PatternTokenizerBuilder(@JsonProperty("pattern") String pattern, - @JsonProperty("flags") Integer flags, - @JsonProperty("group") Integer group) { - this.pattern = getOrDefault(pattern, DEFAULT_PATTERN); - this.flags = getOrDefault(flags, DEFAULT_FLAGS); - this.group = getOrDefault(group, DEFAULT_GROUP); - } - - /** {@inheritDoc} */ - @Override - public PatternTokenizer buildTokenizer() { - return new PatternTokenizer(Pattern.compile(pattern, flags), group); - } -} diff --git a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/ReversePathHierarchyTokenizerBuilder.java b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/ReversePathHierarchyTokenizerBuilder.java index d2ae11c70..9f3308381 100644 --- a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/ReversePathHierarchyTokenizerBuilder.java +++ b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/ReversePathHierarchyTokenizerBuilder.java @@ -15,61 +15,3 @@ */ package com.stratio.cassandra.lucene.schema.analysis.tokenizer; -import com.fasterxml.jackson.annotation.JsonCreator; -import com.fasterxml.jackson.annotation.JsonProperty; -import org.apache.lucene.analysis.path.ReversePathHierarchyTokenizer; - -/** - * A {@link TokenizerBuilder} for building {@link org.apache.lucene.analysis.path.ReversePathHierarchyTokenizer} - * - * @author Eduardo Alonso {@literal } - */ -public class ReversePathHierarchyTokenizerBuilder extends TokenizerBuilder { - - static final Integer DEFAULT_BUFFER_SIZE = 1024; - static final Character DEFAULT_DELIMITER = '/'; - static final Character DEFAULT_REPLACEMENT = '/'; - static final Integer DEFAULT_SKIP = 0; - - /** terms cache read buffer size */ - @JsonProperty("buffer_size") - final Integer bufferSize; - - /** path separator */ - @JsonProperty("delimiter") - final Character delimiter; - - /** a replacement character for delimiter */ - @JsonProperty("replacement") - final Character replacement; - - /** number of initial tokens to skip */ - @JsonProperty("skip") - final Integer skip; - - /** - * Builds a new {@link ReversePathHierarchyTokenizerBuilder} using the specified bufferSize, delimiter, replacement - * and skip. 
- * - * @param bufferSize terms cache read buffer size - * @param delimiter path separator - * @param replacement a replacement character for delimiter - * @param skip number of initial tokens to skip - */ - @JsonCreator - public ReversePathHierarchyTokenizerBuilder(@JsonProperty("buffer_size") Integer bufferSize, - @JsonProperty("delimiter") Character delimiter, - @JsonProperty("replacement") Character replacement, - @JsonProperty("skip") Integer skip) { - this.bufferSize = getOrDefault(bufferSize, DEFAULT_BUFFER_SIZE); - this.delimiter = getOrDefault(delimiter, DEFAULT_DELIMITER); - this.replacement = getOrDefault(replacement, DEFAULT_REPLACEMENT); - this.skip = getOrDefault(skip, DEFAULT_SKIP); - } - - /** {@inheritDoc} */ - @Override - public ReversePathHierarchyTokenizer buildTokenizer() { - return new ReversePathHierarchyTokenizer(bufferSize, delimiter, replacement, skip); - } -} diff --git a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/StandardTokenizerBuilder.java b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/StandardTokenizerBuilder.java index 5faa1f480..9f3308381 100644 --- a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/StandardTokenizerBuilder.java +++ b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/StandardTokenizerBuilder.java @@ -15,38 +15,3 @@ */ package com.stratio.cassandra.lucene.schema.analysis.tokenizer; -import com.fasterxml.jackson.annotation.JsonCreator; -import com.fasterxml.jackson.annotation.JsonProperty; -import org.apache.lucene.analysis.standard.StandardTokenizer; - -/** - * A {@link TokenizerBuilder} for building {@link org.apache.lucene.analysis.standard.StandardTokenizer} - * - * @author Eduardo Alonso {@literal } - */ -public class StandardTokenizerBuilder extends TokenizerBuilder { - - static final Integer DEFAULT_MAX_TOKEN_LENGTH = 255; - - /** If a token length is bigger that this, token is split at max token length intervals. */ - @JsonProperty("max_token_length") - final Integer maxTokenLength; - - /** - * Builds a new {@link StandardTokenizerBuilder} using the specified bufferSize, delimiter, replacement and skip. - * - * @param maxTokenLength if a token length is bigger that this, token is split at max token length intervals. - */ - @JsonCreator - public StandardTokenizerBuilder(@JsonProperty("max_token_length") Integer maxTokenLength) { - this.maxTokenLength = getOrDefault(maxTokenLength, DEFAULT_MAX_TOKEN_LENGTH); - } - - /** {@inheritDoc} */ - @Override - public StandardTokenizer buildTokenizer() { - StandardTokenizer tokenizer = new StandardTokenizer(); - tokenizer.setMaxTokenLength(maxTokenLength); - return tokenizer; - } -} diff --git a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/ThaiTokenizerBuilder.java b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/ThaiTokenizerBuilder.java index 4d46ba9da..18edfcd00 100644 --- a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/ThaiTokenizerBuilder.java +++ b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/ThaiTokenizerBuilder.java @@ -14,27 +14,3 @@ * limitations under the License. 
*/ package com.stratio.cassandra.lucene.schema.analysis.tokenizer; - -import com.fasterxml.jackson.annotation.JsonCreator; -import org.apache.lucene.analysis.th.ThaiTokenizer; - -/** - * A {@link ThaiTokenizer} for building {@link org.apache.lucene.analysis.th.ThaiTokenizer} - * - * @author Juan Pedro Gilaberte {@literal } - */ -public class ThaiTokenizerBuilder extends TokenizerBuilder { - - /** - * Builds a new {@link ThaiTokenizerBuilder} - */ - @JsonCreator - public ThaiTokenizerBuilder() { - } - - /** {@inheritDoc} */ - @Override - public ThaiTokenizer buildTokenizer() { - return new ThaiTokenizer(); - } -} diff --git a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/TokenizerBuilder.java b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/TokenizerBuilder.java index 0cc0b3e2f..18edfcd00 100644 --- a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/TokenizerBuilder.java +++ b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/TokenizerBuilder.java @@ -14,50 +14,3 @@ * limitations under the License. */ package com.stratio.cassandra.lucene.schema.analysis.tokenizer; - -import com.fasterxml.jackson.annotation.JsonSubTypes; -import com.fasterxml.jackson.annotation.JsonTypeInfo; -import org.apache.lucene.analysis.Tokenizer; - -/** - * @author Eduardo Alonso {@literal } - */ -@JsonTypeInfo(use = JsonTypeInfo.Id.NAME, include = JsonTypeInfo.As.PROPERTY, property = "type") -@JsonSubTypes({@JsonSubTypes.Type(value = ClassicTokenizerBuilder.class, name = "classic"), - @JsonSubTypes.Type(value = EdgeNGramTokenizerBuilder.class, name = "edge_ngram"), - @JsonSubTypes.Type(value = KeywordTokenizerBuilder.class, name = "keyword"), - @JsonSubTypes.Type(value = LetterTokenizerBuilder.class, name = "letter"), - @JsonSubTypes.Type(value = LowerCaseTokenizerBuilder.class, name = "lower_case"), - @JsonSubTypes.Type(value = NGramTokenizerBuilder.class, name = "ngram"), - @JsonSubTypes.Type(value = PathHierarchyTokenizerBuilder.class, name = "path_hierarchy"), - @JsonSubTypes.Type(value = PatternTokenizerBuilder.class, name = "pattern"), - @JsonSubTypes.Type(value = ReversePathHierarchyTokenizerBuilder.class, name = "reverse_path_hierarchy"), - @JsonSubTypes.Type(value = StandardTokenizerBuilder.class, name = "standard"), - @JsonSubTypes.Type(value = UAX29URLEmailTokenizerBuilder.class, name = "uax29_url_email"), - @JsonSubTypes.Type(value = UnicodeWhitespaceTokenizerBuilder.class, name = "unicode_whitespace"), - @JsonSubTypes.Type(value = ThaiTokenizerBuilder.class, name = "thai"), - @JsonSubTypes.Type(value = WhitespaceTokenizerBuilder.class, name = "whitespace"), - @JsonSubTypes.Type(value = WikipediaTokenizerBuilder.class, name = "wikipedia")}) -public abstract class TokenizerBuilder { - - /** - * Gets or creates the Lucene {@link Tokenizer}. - * - * @return the built analyzer - */ - public abstract T buildTokenizer(); - - /** - * @param param the main parameter. - * @param defaultParam the default parameter if main paramaeter is null. 
- * @param return type must extend {@link Tokenizer} - * @return if (param!=null) { return param; }else{ return defaultParam; } - */ - public static T getOrDefault(T param, T defaultParam) { - if (param == null) { - return defaultParam; - } else { - return param; - } - } -} diff --git a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/UAX29URLEmailTokenizerBuilder.java b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/UAX29URLEmailTokenizerBuilder.java index f5964fa1e..65b467cfd 100644 --- a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/UAX29URLEmailTokenizerBuilder.java +++ b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/UAX29URLEmailTokenizerBuilder.java @@ -15,39 +15,4 @@ */ package com.stratio.cassandra.lucene.schema.analysis.tokenizer; -import com.fasterxml.jackson.annotation.JsonCreator; -import com.fasterxml.jackson.annotation.JsonProperty; -import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer; -/** - * A {@link TokenizerBuilder} for building {@link org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer} - * - * @author Eduardo Alonso {@literal } - */ -public class UAX29URLEmailTokenizerBuilder extends TokenizerBuilder { - - static final Integer DEFAULT_MAX_TOKEN_LENGTH = 255; - - /** If a token length is bigger that this, token is split at max token length intervals. */ - @JsonProperty("max_token_length") - final Integer maxTokenLength; - - /** - * Builds a new {@link UAX29URLEmailTokenizerBuilder} using the specified maxTokenLength. - * - * @param maxTokenLength if a token length is bigger that this, token is split at max token length intervals. - */ - @JsonCreator - public UAX29URLEmailTokenizerBuilder(@JsonProperty("max_token_length") Integer maxTokenLength) { - this.maxTokenLength = getOrDefault(maxTokenLength, DEFAULT_MAX_TOKEN_LENGTH); - - } - - /** {@inheritDoc} */ - @Override - public UAX29URLEmailTokenizer buildTokenizer() { - UAX29URLEmailTokenizer tokenizer = new UAX29URLEmailTokenizer(); - tokenizer.setMaxTokenLength(maxTokenLength); - return tokenizer; - } -} diff --git a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/UnicodeWhitespaceTokenizerBuilder.java b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/UnicodeWhitespaceTokenizerBuilder.java index c63daa817..18edfcd00 100644 --- a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/UnicodeWhitespaceTokenizerBuilder.java +++ b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/UnicodeWhitespaceTokenizerBuilder.java @@ -14,19 +14,3 @@ * limitations under the License. 
*/ package com.stratio.cassandra.lucene.schema.analysis.tokenizer; - -import org.apache.lucene.analysis.core.UnicodeWhitespaceTokenizer; - -/** - * A {@link TokenizerBuilder} for building {@link org.apache.lucene.analysis.core.UnicodeWhitespaceTokenizer} - * - * @author Eduardo Alonso {@literal } - */ -public class UnicodeWhitespaceTokenizerBuilder extends TokenizerBuilder { - - /** {@inheritDoc} */ - @Override - public UnicodeWhitespaceTokenizer buildTokenizer() { - return new UnicodeWhitespaceTokenizer(); - } -} diff --git a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/WhitespaceTokenizerBuilder.java b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/WhitespaceTokenizerBuilder.java index 308d7e073..18edfcd00 100644 --- a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/WhitespaceTokenizerBuilder.java +++ b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/WhitespaceTokenizerBuilder.java @@ -14,27 +14,3 @@ * limitations under the License. */ package com.stratio.cassandra.lucene.schema.analysis.tokenizer; - -import com.fasterxml.jackson.annotation.JsonCreator; -import org.apache.lucene.analysis.core.WhitespaceTokenizer; - -/** - * A {@link TokenizerBuilder} for building {@link org.apache.lucene.analysis.core.WhitespaceTokenizer} - * - * @author Eduardo Alonso {@literal } - */ -public class WhitespaceTokenizerBuilder extends TokenizerBuilder { - - /** - * Builds a new {@link WhitespaceTokenizerBuilder} - */ - @JsonCreator - public WhitespaceTokenizerBuilder() { - } - - /** {@inheritDoc} */ - @Override - public WhitespaceTokenizer buildTokenizer() { - return new WhitespaceTokenizer(); - } -} diff --git a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/WikipediaTokenizerBuilder.java b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/WikipediaTokenizerBuilder.java index 6c851f470..18edfcd00 100644 --- a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/WikipediaTokenizerBuilder.java +++ b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/WikipediaTokenizerBuilder.java @@ -14,84 +14,3 @@ * limitations under the License. 
*/ package com.stratio.cassandra.lucene.schema.analysis.tokenizer; - -import com.fasterxml.jackson.annotation.JsonCreator; -import com.fasterxml.jackson.annotation.JsonProperty; -import org.apache.lucene.analysis.wikipedia.WikipediaTokenizer; - -import java.util.Collections; -import java.util.Set; - -/** - * A {@link TokenizerBuilder} for building {@link org.apache.lucene.analysis.wikipedia.WikipediaTokenizer} - * - * @author Eduardo Alonso {@literal } - */ -public class WikipediaTokenizerBuilder extends TokenizerBuilder { - - static final TokenOutputValue DEFAULT_TOKEN_OUTPUT = TokenOutputValue.TOKENS_ONLY; - static final Set DEFAULT_UNTOKENIZED_TYPES = Collections.emptySet(); - - public enum TokenOutputValue { - - TOKENS_ONLY("TOKENS_ONLY", WikipediaTokenizer.TOKENS_ONLY), - UNTOKENIZED_ONLY("UNTOKENIZED_ONLY", WikipediaTokenizer.UNTOKENIZED_ONLY), - BOTH("BOTH", WikipediaTokenizer.BOTH); - - private int integerValue; - private String stringValue; - - TokenOutputValue(String name, int value) { - this.stringValue = name; - this.integerValue = value; - } - - @JsonCreator - public static TokenOutputValue create(String value) { - if (value == null) { - throw new IllegalArgumentException(); - } - for (TokenOutputValue v : values()) { - if (v.getStringValue().equals(value)) { - return v; - } - } - throw new IllegalArgumentException(); - } - - public int getIntegerValue() { - return integerValue; - } - - public String getStringValue() { - return stringValue; - } - } - - /** this tokenizer output, only untokenized, only tokens or both */ - @JsonProperty("token_output") - final TokenOutputValue tokenOutput; - /** //TODO */ - @JsonProperty("untokenized_types") - final Set untokenizedTypes; - - /** - * Builds a new {@link WikipediaTokenizerBuilder} using the specified tokenOutput and untokenizedTypes. - * - * @param tokenOutput this tokenizer output, only untokenized, only tokens or both - * @param untokenizedTypes //TODO - */ - @JsonCreator - public WikipediaTokenizerBuilder(@JsonProperty("token_output") WikipediaTokenizerBuilder.TokenOutputValue tokenOutput, - @JsonProperty("untokenized_types") Set untokenizedTypes) { - this.tokenOutput = getOrDefault(tokenOutput, DEFAULT_TOKEN_OUTPUT); - this.untokenizedTypes = getOrDefault(untokenizedTypes, DEFAULT_UNTOKENIZED_TYPES); - } - - /** {@inheritDoc} */ - @Override - public WikipediaTokenizer buildTokenizer() { - return new WikipediaTokenizer(tokenOutput.getIntegerValue(), untokenizedTypes); - } - -} \ No newline at end of file diff --git a/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/ClassicTokenizerBuilder.scala b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/ClassicTokenizerBuilder.scala new file mode 100644 index 000000000..73f3ce26a --- /dev/null +++ b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/ClassicTokenizerBuilder.scala @@ -0,0 +1,44 @@ +/* + * Copyright (C) 2014 Stratio (http://stratio.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+package com.stratio.cassandra.lucene.schema.analysis.tokenizer
+
+import com.fasterxml.jackson.annotation.JsonProperty
+import org.apache.lucene.analysis.standard.ClassicTokenizer
+
+/**
+ * A {@link TokenizerBuilder} for building {@link org.apache.lucene.analysis.standard.ClassicTokenizer}
+ *
+ * @author Juan Pedro Gilaberte jpgilaberte@stratio.com
+ * @param maxTokenLength if a token is longer than this, it is split at maxTokenLength intervals
+ */
+case class ClassicTokenizerBuilder(@JsonProperty("max_token_length") maxTokenLength: Integer) extends TokenizerBuilder[ClassicTokenizer] {
+
+  /**
+   * Gets or creates the Lucene {@link Tokenizer}.
+   *
+   * @return the built tokenizer
+   */
+  override val function = () => {
+    val tokenizer = new ClassicTokenizer()
+    tokenizer.setMaxTokenLength(getOrDefault(Option(maxTokenLength),
+                                             ClassicTokenizerBuilder.DEFAULT_MAX_TOKEN_LENGTH).asInstanceOf[Integer])
+    tokenizer
+  }
+}
+
+object ClassicTokenizerBuilder {
+  final val DEFAULT_MAX_TOKEN_LENGTH = 255
+}
\ No newline at end of file
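A minimal usage sketch (not part of the patch): the builders are resolved through Jackson's polymorphic "type" tag, so a JSON tokenizer spec can be turned into a live Lucene tokenizer with the project's own JsonSerializer, exactly as the deleted Java tests further below do. The JSON values are illustrative:

    import com.stratio.cassandra.lucene.common.JsonSerializer

    // The "type" tag selects the builder subtype registered in TokenizerBuilder's
    // @JsonSubTypes mapping; the remaining properties bind to the case class fields.
    val builder = JsonSerializer.fromString("{type: \"classic\", max_token_length: 250}",
                                            classOf[TokenizerBuilder[_]])
    val tokenizer = builder.buildTokenizer // a ClassicTokenizer with maxTokenLength = 250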
diff --git a/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/EdgeNGramTokenizerBuilder.scala b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/EdgeNGramTokenizerBuilder.scala
new file mode 100644
index 000000000..d8f08b854
--- /dev/null
+++ b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/EdgeNGramTokenizerBuilder.scala
@@ -0,0 +1,29 @@
+package com.stratio.cassandra.lucene.schema.analysis.tokenizer
+
+import com.fasterxml.jackson.annotation.JsonProperty
+import org.apache.lucene.analysis.ngram.EdgeNGramTokenizer
+
+/**
+ * A {@link TokenizerBuilder} for building {@link org.apache.lucene.analysis.ngram.EdgeNGramTokenizer}
+ *
+ * @author Juan Pedro Gilaberte jpgilaberte@stratio.com
+ * @param minGram the smallest n-gram to generate
+ * @param maxGram the largest n-gram to generate
+ */
+case class EdgeNGramTokenizerBuilder(@JsonProperty("min_gram") final val minGram: Integer,
+                                     @JsonProperty("max_gram") final val maxGram: Integer) extends TokenizerBuilder[EdgeNGramTokenizer] {
+  /**
+   * Builds a new {@link EdgeNGramTokenizer} using the specified minGram and maxGram.
+   */
+  override def function = () => {
+    new EdgeNGramTokenizer(getOrDefault(Option(minGram), EdgeNGramTokenizerBuilder.DEFAULT_MIN_GRAM).asInstanceOf[Integer],
+                           getOrDefault(Option(maxGram), EdgeNGramTokenizerBuilder.DEFAULT_MAX_GRAM).asInstanceOf[Integer])
+  }
+}
+
+object EdgeNGramTokenizerBuilder {
+  final val DEFAULT_MIN_GRAM = 1
+  final val DEFAULT_MAX_GRAM = 1
+}
\ No newline at end of file
diff --git a/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/KeywordTokenizerBuilder.scala b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/KeywordTokenizerBuilder.scala
new file mode 100644
index 000000000..28141288a
--- /dev/null
+++ b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/KeywordTokenizerBuilder.scala
@@ -0,0 +1,24 @@
+package com.stratio.cassandra.lucene.schema.analysis.tokenizer
+
+import com.fasterxml.jackson.annotation.JsonProperty
+import org.apache.lucene.analysis.core.KeywordTokenizer
+
+/**
+ * A {@link TokenizerBuilder} for building {@link org.apache.lucene.analysis.core.KeywordTokenizer}
+ *
+ * @author Juan Pedro Gilaberte jpgilaberte@stratio.com
+ * @param bufferSize the terms cache read buffer size
+ */
+case class KeywordTokenizerBuilder(@JsonProperty("buffer_size") final val bufferSize: Integer = KeywordTokenizerBuilder.DEFAULT_BUFFER_SIZE) extends TokenizerBuilder[KeywordTokenizer] {
+  /**
+   * Builds a new {@link KeywordTokenizer} using the specified bufferSize.
+   */
+  override def function: () => KeywordTokenizer = () => new KeywordTokenizer(getOrDefault(Option(bufferSize), KeywordTokenizerBuilder.DEFAULT_BUFFER_SIZE).asInstanceOf[Integer])
+}
+
+object KeywordTokenizerBuilder {
+  final val DEFAULT_BUFFER_SIZE = 256
+}
+
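To see what a given builder actually emits, a small throwaway helper is handy. This is a sketch only; the terms function below is not part of the patch, just standard Lucene attribute plumbing:

    import java.io.StringReader
    import org.apache.lucene.analysis.Tokenizer
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute
    import scala.collection.mutable.ListBuffer

    // Drain every term a tokenizer emits for the given input text.
    def terms(tokenizer: Tokenizer, text: String): List[String] = {
      val term = tokenizer.addAttribute(classOf[CharTermAttribute])
      tokenizer.setReader(new StringReader(text))
      tokenizer.reset()
      val out = ListBuffer.empty[String]
      while (tokenizer.incrementToken()) out += term.toString
      tokenizer.end()
      tokenizer.close()
      out.toList
    }

    // terms(EdgeNGramTokenizerBuilder(1, 3).buildTokenizer, "search")
    // should yield List("s", "se", "sea"): n-grams anchored at the token's front edge.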
diff --git a/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/LetterTokenizerBuilder.scala b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/LetterTokenizerBuilder.scala
new file mode 100644
index 000000000..fdf7973b6
--- /dev/null
+++ b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/LetterTokenizerBuilder.scala
@@ -0,0 +1,19 @@
+package com.stratio.cassandra.lucene.schema.analysis.tokenizer
+
+import org.apache.lucene.analysis.core.LetterTokenizer
+
+/**
+ * A {@link TokenizerBuilder} for building {@link org.apache.lucene.analysis.core.LetterTokenizer}
+ *
+ * @author Juan Pedro Gilaberte jpgilaberte@stratio.com
+ */
+case class LetterTokenizerBuilder() extends TokenizerBuilder[LetterTokenizer] {
+  /**
+   * Builds a new {@link LetterTokenizer}.
+   */
+  override def function: () => LetterTokenizer = () => new LetterTokenizer
+}
+
+object LetterTokenizerBuilder {}
+
+
diff --git a/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/LowerCaseTokenizerBuilder.scala b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/LowerCaseTokenizerBuilder.scala
new file mode 100644
index 000000000..0bb03f94e
--- /dev/null
+++ b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/LowerCaseTokenizerBuilder.scala
@@ -0,0 +1,18 @@
+package com.stratio.cassandra.lucene.schema.analysis.tokenizer
+
+import org.apache.lucene.analysis.core.LowerCaseTokenizer
+
+/**
+ * A {@link TokenizerBuilder} for building {@link org.apache.lucene.analysis.core.LowerCaseTokenizer}
+ *
+ * @author Juan Pedro Gilaberte jpgilaberte@stratio.com
+ */
+case class LowerCaseTokenizerBuilder() extends TokenizerBuilder[LowerCaseTokenizer] {
+  /**
+   * Builds a new {@link LowerCaseTokenizer}.
+   */
+  override def function: () => LowerCaseTokenizer = () => new LowerCaseTokenizer
+}
+
+object LowerCaseTokenizerBuilder {}
\ No newline at end of file
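The two builders above differ only in case folding: LowerCaseTokenizer behaves like LetterTokenizer followed by a lower-case filter. Reusing the terms sketch from earlier, with illustrative expected values:

    terms(LetterTokenizerBuilder().buildTokenizer, "Foo-Bar 42")    // List("Foo", "Bar")
    terms(LowerCaseTokenizerBuilder().buildTokenizer, "Foo-Bar 42") // List("foo", "bar")

Both drop the digits, since only runs of letters form tokens.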
diff --git a/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/NGramTokenizerBuilder.scala b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/NGramTokenizerBuilder.scala
new file mode 100644
index 000000000..4a60fd80a
--- /dev/null
+++ b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/NGramTokenizerBuilder.scala
@@ -0,0 +1,28 @@
+package com.stratio.cassandra.lucene.schema.analysis.tokenizer
+
+import com.fasterxml.jackson.annotation.JsonProperty
+import org.apache.lucene.analysis.ngram.NGramTokenizer
+
+/**
+ * A {@link TokenizerBuilder} for building {@link org.apache.lucene.analysis.ngram.NGramTokenizer}
+ *
+ * @author Juan Pedro Gilaberte jpgilaberte@stratio.com
+ * @param minGram the smallest n-gram to generate
+ * @param maxGram the largest n-gram to generate
+ */
+case class NGramTokenizerBuilder(@JsonProperty("min_gram") final val minGram: Integer,
+                                 @JsonProperty("max_gram") final val maxGram: Integer) extends TokenizerBuilder[NGramTokenizer] {
+
+  /**
+   * Builds a new {@link NGramTokenizer} using the specified minGram and maxGram.
+   */
+  override def function = () => new NGramTokenizer(
+    getOrDefault(Option(minGram), NGramTokenizerBuilder.DEFAULT_MIN_GRAM).asInstanceOf[Integer],
+    getOrDefault(Option(maxGram), NGramTokenizerBuilder.DEFAULT_MAX_GRAM).asInstanceOf[Integer])
+}
+
+object NGramTokenizerBuilder {
+  final val DEFAULT_MIN_GRAM = 1
+  final val DEFAULT_MAX_GRAM = 2
+}
diff --git a/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/PathHierarchyTokenizerBuilder.scala b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/PathHierarchyTokenizerBuilder.scala
new file mode 100644
index 000000000..29876b425
--- /dev/null
+++ b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/PathHierarchyTokenizerBuilder.scala
@@ -0,0 +1,34 @@
+package com.stratio.cassandra.lucene.schema.analysis.tokenizer
+
+import com.fasterxml.jackson.annotation.JsonProperty
+import org.apache.lucene.analysis.path.PathHierarchyTokenizer
+
+/**
+ * A {@link TokenizerBuilder} for building {@link org.apache.lucene.analysis.path.PathHierarchyTokenizer}
+ *
+ * @author Juan Pedro Gilaberte jpgilaberte@stratio.com
+ * @param bufferSize terms cache read buffer size
+ * @param delimiter path separator
+ * @param replacement a replacement character for delimiter
+ * @param skip number of initial tokens to skip
+ */
+case class PathHierarchyTokenizerBuilder(@JsonProperty("buffer_size") final val bufferSize: Integer,
+                                         @JsonProperty("delimiter") final val delimiter: Character,
+                                         @JsonProperty("replacement") final val replacement: Character,
+                                         @JsonProperty("skip") final val skip: Integer) extends TokenizerBuilder[PathHierarchyTokenizer] {
+  /**
+   * Builds a new {@link PathHierarchyTokenizer} using the specified bufferSize, delimiter, replacement and skip.
+   */
+  override def function = () => new PathHierarchyTokenizer(
+    getOrDefault(Option(bufferSize), PathHierarchyTokenizerBuilder.DEFAULT_BUFFER_SIZE).asInstanceOf[Integer],
+    getOrDefault(Option(delimiter), PathHierarchyTokenizerBuilder.DEFAULT_DELIMITER).asInstanceOf[Char],
+    getOrDefault(Option(replacement), PathHierarchyTokenizerBuilder.DEFAULT_REPLACEMENT).asInstanceOf[Char],
+    getOrDefault(Option(skip), PathHierarchyTokenizerBuilder.DEFAULT_SKIP).asInstanceOf[Integer])
+}
+
+object PathHierarchyTokenizerBuilder {
+  final val DEFAULT_BUFFER_SIZE = 1024
+  final val DEFAULT_DELIMITER = '/'
+  final val DEFAULT_REPLACEMENT = '/'
+  final val DEFAULT_SKIP = 0
+}
\ No newline at end of file
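PathHierarchyTokenizer emits one token per path prefix, which is what makes it useful for drill-down over hierarchical values. A hedged sketch, reusing the terms helper from earlier:

    // With the defaults (delimiter '/', skip 0) a path expands into its prefixes.
    val path = JsonSerializer.fromString("{type: \"path_hierarchy\"}",
                                         classOf[TokenizerBuilder[_]]).buildTokenizer
    terms(path, "/usr/share/doc")
    // expected: List("/usr", "/usr/share", "/usr/share/doc")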
diff --git a/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/PatternTokenizerBuilder.scala b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/PatternTokenizerBuilder.scala
new file mode 100644
index 000000000..909d10a38
--- /dev/null
+++ b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/PatternTokenizerBuilder.scala
@@ -0,0 +1,33 @@
+package com.stratio.cassandra.lucene.schema.analysis.tokenizer
+
+import java.util.regex.Pattern
+
+import com.fasterxml.jackson.annotation.JsonProperty
+import org.apache.lucene.analysis.pattern.PatternTokenizer
+
+/**
+ * A {@link TokenizerBuilder} for building {@link org.apache.lucene.analysis.pattern.PatternTokenizer}
+ *
+ * @author Juan Pedro Gilaberte jpgilaberte@stratio.com
+ * @param pattern the java regular expression
+ * @param flags the java regular expression flags
+ * @param group which pattern group to use to generate tokens (-1 for split)
+ */
+case class PatternTokenizerBuilder(@JsonProperty("pattern") final val pattern: String,
+                                   @JsonProperty("flags") final val flags: Integer,
+                                   @JsonProperty("group") final val group: Integer) extends TokenizerBuilder[PatternTokenizer] {
+  /**
+   * Builds a new {@link PatternTokenizer} using the specified pattern, flags and group.
+   */
+  override def function = () => new PatternTokenizer(
+    Pattern.compile(getOrDefault(Option(pattern), PatternTokenizerBuilder.DEFAULT_PATTERN).asInstanceOf[String],
+                    getOrDefault(Option(flags), PatternTokenizerBuilder.DEFAULT_FLAGS).asInstanceOf[Integer]),
+    getOrDefault(Option(group), PatternTokenizerBuilder.DEFAULT_GROUP).asInstanceOf[Integer])
+}
+
+object PatternTokenizerBuilder {
+  final val DEFAULT_PATTERN = "\\W+"
+  final val DEFAULT_FLAGS = 0
+  final val DEFAULT_GROUP = -1
+}
\ No newline at end of file
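The group parameter decides whether the pattern splits or extracts: -1 tokenizes the text between matches, while a non-negative value emits that capture group for every match, mirroring Lucene's PatternTokenizer contract. An illustrative sketch with the earlier terms helper:

    // group = -1 (the default): the pattern acts as a separator.
    terms(PatternTokenizerBuilder(",", null, null).buildTokenizer, "a,b,c")
    // expected: List("a", "b", "c")

    // group = 1: emit the first capture group of each match instead.
    terms(PatternTokenizerBuilder("'([^']+)'", null, 1).buildTokenizer, "'a' and 'b'")
    // expected: List("a", "b")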
diff --git a/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/ReversePathHierarchyTokenizerBuilder.scala b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/ReversePathHierarchyTokenizerBuilder.scala
new file mode 100644
index 000000000..bccece833
--- /dev/null
+++ b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/ReversePathHierarchyTokenizerBuilder.scala
@@ -0,0 +1,34 @@
+package com.stratio.cassandra.lucene.schema.analysis.tokenizer
+
+import com.fasterxml.jackson.annotation.JsonProperty
+import org.apache.lucene.analysis.path.ReversePathHierarchyTokenizer
+
+/**
+ * A {@link TokenizerBuilder} for building {@link org.apache.lucene.analysis.path.ReversePathHierarchyTokenizer}
+ *
+ * @author Juan Pedro Gilaberte jpgilaberte@stratio.com
+ * @param bufferSize terms cache read buffer size
+ * @param delimiter path separator
+ * @param replacement a replacement character for delimiter
+ * @param skip number of initial tokens to skip
+ */
+case class ReversePathHierarchyTokenizerBuilder(@JsonProperty("buffer_size") final val bufferSize: Integer,
+                                                @JsonProperty("delimiter") final val delimiter: Character,
+                                                @JsonProperty("replacement") final val replacement: Character,
+                                                @JsonProperty("skip") final val skip: Integer) extends TokenizerBuilder[ReversePathHierarchyTokenizer] {
+
+  /**
+   * Builds a new {@link ReversePathHierarchyTokenizer} using the specified bufferSize, delimiter, replacement and skip values.
+   */
+  override def function = () => new ReversePathHierarchyTokenizer(
+    getOrDefault(Option(bufferSize), ReversePathHierarchyTokenizerBuilder.DEFAULT_BUFFER_SIZE).asInstanceOf[Int],
+    getOrDefault(Option(delimiter), ReversePathHierarchyTokenizerBuilder.DEFAULT_DELIMITER).asInstanceOf[Char],
+    getOrDefault(Option(replacement), ReversePathHierarchyTokenizerBuilder.DEFAULT_REPLACEMENT).asInstanceOf[Char],
+    getOrDefault(Option(skip), ReversePathHierarchyTokenizerBuilder.DEFAULT_SKIP).asInstanceOf[Int])
+}
+
+object ReversePathHierarchyTokenizerBuilder {
+  final val DEFAULT_BUFFER_SIZE = 1024
+  final val DEFAULT_DELIMITER = '/'
+  final val DEFAULT_REPLACEMENT = '/'
+  final val DEFAULT_SKIP = 0
+}
\ No newline at end of file
diff --git a/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/StandardTokenizerBuilder.scala b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/StandardTokenizerBuilder.scala
new file mode 100644
index 000000000..9ccb7336f
--- /dev/null
+++ b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/StandardTokenizerBuilder.scala
@@ -0,0 +1,26 @@
+package com.stratio.cassandra.lucene.schema.analysis.tokenizer
+
+import com.fasterxml.jackson.annotation.JsonProperty
+import org.apache.lucene.analysis.standard.StandardTokenizer
+
+/**
+ * A {@link TokenizerBuilder} for building {@link org.apache.lucene.analysis.standard.StandardTokenizer}
+ *
+ * @author Juan Pedro Gilaberte jpgilaberte@stratio.com
+ * @param maxTokenLength if a token is longer than this, it is split at maxTokenLength intervals
+ */
+case class StandardTokenizerBuilder(@JsonProperty("max_token_length") final val maxTokenLength: Integer) extends TokenizerBuilder[StandardTokenizer] {
+  /**
+   * Builds a new {@link StandardTokenizer} using the specified maxTokenLength.
+   */
+  override def function = () => {
+    val tokenizer: StandardTokenizer = new StandardTokenizer
+    tokenizer.setMaxTokenLength(getOrDefault(Option(maxTokenLength), StandardTokenizerBuilder.DEFAULT_MAX_TOKEN_LENGTH).asInstanceOf[Int])
+    tokenizer
+  }
+}
+
+object StandardTokenizerBuilder {
+  final val DEFAULT_MAX_TOKEN_LENGTH = 255
+}
\ No newline at end of file
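StandardTokenizer applies Unicode UAX#29 word segmentation and is the usual general-purpose choice; max_token_length caps each token, and per the builder's contract longer runs are split at that interval rather than dropped. A hedged sketch with the terms helper:

    val standard = StandardTokenizerBuilder(5).buildTokenizer
    terms(standard, "hello metasyntactic")
    // expected: List("hello", "metas", "yntac", "tic")
    // the 13-char word is cut into chunks of max_token_length = 5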
diff --git a/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/ThaiTokenizerBuilder.scala b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/ThaiTokenizerBuilder.scala
new file mode 100644
index 000000000..7039e9887
--- /dev/null
+++ b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/ThaiTokenizerBuilder.scala
@@ -0,0 +1,17 @@
+package com.stratio.cassandra.lucene.schema.analysis.tokenizer
+
+import org.apache.lucene.analysis.th.ThaiTokenizer
+
+/**
+ * A {@link TokenizerBuilder} for building {@link org.apache.lucene.analysis.th.ThaiTokenizer}
+ *
+ * @author Juan Pedro Gilaberte jpgilaberte@stratio.com
+ */
+case class ThaiTokenizerBuilder() extends TokenizerBuilder[ThaiTokenizer] {
+  /**
+   * Builds a new {@link ThaiTokenizer}.
+   */
+  override def function = () => new ThaiTokenizer
+}
+
+object ThaiTokenizerBuilder {}
\ No newline at end of file
diff --git a/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/TokenizerBuilder.scala b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/TokenizerBuilder.scala
new file mode 100644
index 000000000..24a811196
--- /dev/null
+++ b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/TokenizerBuilder.scala
@@ -0,0 +1,60 @@
+package com.stratio.cassandra.lucene.schema.analysis.tokenizer
+
+import com.fasterxml.jackson.annotation.JsonSubTypes.Type
+import com.fasterxml.jackson.annotation.{JsonSubTypes, JsonTypeInfo}
+import org.apache.lucene.analysis.Tokenizer
+
+/**
+ * @author Juan Pedro Gilaberte jpgilaberte@stratio.com
+ */
+@JsonTypeInfo(use = JsonTypeInfo.Id.NAME, include = JsonTypeInfo.As.PROPERTY, property = "type")
+@JsonSubTypes(Array(new Type(value = classOf[ClassicTokenizerBuilder], name = "classic"),
+  new Type(value = classOf[EdgeNGramTokenizerBuilder], name = "edge_ngram"),
+  new Type(value = classOf[KeywordTokenizerBuilder], name = "keyword"),
+  new Type(value = classOf[LetterTokenizerBuilder], name = "letter"),
+  new Type(value = classOf[LowerCaseTokenizerBuilder], name = "lower_case"),
+  new Type(value = classOf[NGramTokenizerBuilder], name = "ngram"),
+  new Type(value = classOf[PathHierarchyTokenizerBuilder], name = "path_hierarchy"),
+  new Type(value = classOf[PatternTokenizerBuilder], name = "pattern"),
+  new Type(value = classOf[ReversePathHierarchyTokenizerBuilder], name = "reverse_path_hierarchy"),
+  new Type(value = classOf[StandardTokenizerBuilder], name = "standard"),
+  new Type(value = classOf[UAX29URLEmailTokenizerBuilder], name = "uax29_url_email"),
+  new Type(value = classOf[UnicodeWhitespaceTokenizerBuilder], name = "unicode_whitespace"),
+  new Type(value = classOf[ThaiTokenizerBuilder], name = "thai"),
+  new Type(value = classOf[WhitespaceTokenizerBuilder], name = "whitespace"),
+  new Type(value = classOf[WikipediaTokenizerBuilder], name = "wikipedia"))
+) trait TokenizerBuilder[T <: Tokenizer] {
+
+  /**
+   * @return the no-arg factory that creates the underlying Lucene {@link Tokenizer}
+   */
+  def function: () => T
+
+  //TODO: refactor scala style (remove throw)
+  /**
+   * @param throwable the failure raised while building the tokenizer
+   * @return nothing, the throwable is rethrown
+   */
+  def failThrowException(throwable: Throwable) = throw throwable
+
+  /**
+   * Gets or creates the Lucene {@link Tokenizer}.
+   *
+   * @return the built tokenizer
+   */
+  def buildTokenizer: T = {
+    import scala.util.control.Exception._
+    //TODO: refactor scala style (manage either in other level)
+    catching(classOf[Exception]).either(function()).asInstanceOf[Either[Exception, T]].fold(failThrowException, x => x)
+  }
+
+  /**
+   * @param param the main parameter
+   * @param defaultParam the default parameter, used if the main parameter is null
+   * @return param if it is defined, defaultParam otherwise
+   */
+  def getOrDefault(param: Option[Any], defaultParam: Any): Any = param.getOrElse(defaultParam)
+}
+
+object TokenizerBuilder {}
\ No newline at end of file
diff --git a/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/UAX29URLEmailTokenizerBuilder.scala b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/UAX29URLEmailTokenizerBuilder.scala
new file mode 100644
index 000000000..da8764864
--- /dev/null
+++ b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/UAX29URLEmailTokenizerBuilder.scala
@@ -0,0 +1,26 @@
+package com.stratio.cassandra.lucene.schema.analysis.tokenizer
+
+import com.fasterxml.jackson.annotation.JsonProperty
+import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer
+
+/**
+ * A {@link TokenizerBuilder} for building {@link org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer}
+ *
+ * @author Eduardo Alonso eduardoalonso@stratio.com
+ * @param maxTokenLength if a token is longer than this, it is split at maxTokenLength intervals
+ */
+case class UAX29URLEmailTokenizerBuilder(@JsonProperty("max_token_length") final val maxTokenLength: Integer) extends TokenizerBuilder[UAX29URLEmailTokenizer] {
+  /**
+   * Builds a new {@link UAX29URLEmailTokenizer} using the specified maxTokenLength.
+   */
+  override def function = () => {
+    val tokenizer: UAX29URLEmailTokenizer = new UAX29URLEmailTokenizer
+    tokenizer.setMaxTokenLength(getOrDefault(Option(maxTokenLength), UAX29URLEmailTokenizerBuilder.DEFAULT_MAX_TOKEN_LENGTH).asInstanceOf[Int])
+    tokenizer
+  }
+}
+
+object UAX29URLEmailTokenizerBuilder {
+  final val DEFAULT_MAX_TOKEN_LENGTH = 255
+}
\ No newline at end of file
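The uax29_url_email variant exists because plain UAX#29 segmentation breaks URLs and e-mail addresses into pieces; this tokenizer keeps them whole. An illustrative check with the terms helper:

    val email = JsonSerializer.fromString("{type: \"uax29_url_email\"}",
                                          classOf[TokenizerBuilder[_]]).buildTokenizer
    terms(email, "write to user@example.com")
    // expected: List("write", "to", "user@example.com")
    // the address survives as one token, where a standard tokenizer would split it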
diff --git a/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/UnicodeWhitespaceTokenizerBuilder.scala b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/UnicodeWhitespaceTokenizerBuilder.scala
new file mode 100644
index 000000000..705e7aece
--- /dev/null
+++ b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/UnicodeWhitespaceTokenizerBuilder.scala
@@ -0,0 +1,17 @@
+package com.stratio.cassandra.lucene.schema.analysis.tokenizer
+
+import org.apache.lucene.analysis.core.UnicodeWhitespaceTokenizer
+
+/**
+ * A {@link TokenizerBuilder} for building {@link org.apache.lucene.analysis.core.UnicodeWhitespaceTokenizer}
+ *
+ * @author Eduardo Alonso eduardoalonso@stratio.com
+ */
+case class UnicodeWhitespaceTokenizerBuilder() extends TokenizerBuilder[UnicodeWhitespaceTokenizer] {
+  /**
+   * Builds a new {@link UnicodeWhitespaceTokenizer}.
+   */
+  override def function = () => new UnicodeWhitespaceTokenizer
+}
+
+object UnicodeWhitespaceTokenizerBuilder {}
\ No newline at end of file
diff --git a/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/WhitespaceTokenizerBuilder.scala b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/WhitespaceTokenizerBuilder.scala
new file mode 100644
index 000000000..9e4086678
--- /dev/null
+++ b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/WhitespaceTokenizerBuilder.scala
@@ -0,0 +1,17 @@
+package com.stratio.cassandra.lucene.schema.analysis.tokenizer
+
+import org.apache.lucene.analysis.core.WhitespaceTokenizer
+
+/**
+ * A {@link TokenizerBuilder} for building {@link org.apache.lucene.analysis.core.WhitespaceTokenizer}
+ *
+ * @author Eduardo Alonso eduardoalonso@stratio.com
+ */
+case class WhitespaceTokenizerBuilder() extends TokenizerBuilder[WhitespaceTokenizer] {
+  /**
+   * Builds a new {@link WhitespaceTokenizer}.
+   */
+  override def function = () => new WhitespaceTokenizer
+}
+
+object WhitespaceTokenizerBuilder {}
\ No newline at end of file
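These two builders differ only in what counts as whitespace: WhitespaceTokenizer uses Java's Character.isWhitespace, while UnicodeWhitespaceTokenizer uses the Unicode WHITESPACE property, which additionally covers characters such as the no-break space (U+00A0). A hedged sketch with the terms helper:

    terms(WhitespaceTokenizerBuilder().buildTokenizer, "a\u00A0b")
    // expected: List("a\u00A0b") - U+00A0 is not Java whitespace
    terms(UnicodeWhitespaceTokenizerBuilder().buildTokenizer, "a\u00A0b")
    // expected: List("a", "b")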
diff --git a/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/WikipediaTokenizerBuilder.scala b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/WikipediaTokenizerBuilder.scala
new file mode 100644
index 000000000..8e4fb6f9e
--- /dev/null
+++ b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/WikipediaTokenizerBuilder.scala
@@ -0,0 +1,37 @@
+package com.stratio.cassandra.lucene.schema.analysis.tokenizer
+
+import com.fasterxml.jackson.annotation.JsonProperty
+import org.apache.lucene.analysis.wikipedia.WikipediaTokenizer
+
+/**
+ * A {@link TokenizerBuilder} for building {@link org.apache.lucene.analysis.wikipedia.WikipediaTokenizer}
+ *
+ * @author Eduardo Alonso eduardoalonso@stratio.com
+ * @param tokenOutput this tokenizer's output: only tokens, only untokenized content, or both
+ * @param untokenizedTypes the token types to leave untokenized
+ */
+case class WikipediaTokenizerBuilder(@JsonProperty("token_output") tokenOutput: String,
+                                     @JsonProperty("untokenized_types") final val untokenizedTypes: Array[String]) extends TokenizerBuilder[WikipediaTokenizer] {
+  /**
+   * Builds a new {@link WikipediaTokenizer} using the specified tokenOutput and untokenizedTypes.
+   */
+  override def function = () => {
+    import scala.collection.JavaConverters._
+    // Fall back to the defaults when either JSON property is absent.
+    val output = Option(tokenOutput).map(TokenOutputEnum.withName(_).id)
+      .getOrElse(WikipediaTokenizer.TOKENS_ONLY)
+    val types = Option(untokenizedTypes).map(_.toSet.asJava)
+      .getOrElse(java.util.Collections.emptySet[String]())
+    new WikipediaTokenizer(output, types)
+  }
+}
+
+object WikipediaTokenizerBuilder {
+  final val DEFAULT_TOKEN_OUTPUT = TokenOutputEnum.TOKENS_ONLY
+}
+
+object TokenOutputEnum extends Enumeration {
+  type TokenOutputValue = Value
+  val TOKENS_ONLY = Value("TOKENS_ONLY")
+  val UNTOKENIZED_ONLY = Value("UNTOKENIZED_ONLY")
+  val BOTH = Value("BOTH")
+}
diff --git a/plugin/src/test/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/TokenizerBuilderTest.java b/plugin/src/test/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/TokenizerBuilderTest.java
deleted file mode 100644
index 199d9d2bb..000000000
--- a/plugin/src/test/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/TokenizerBuilderTest.java
+++ /dev/null
@@ -1,397 +0,0 @@
-package com.stratio.cassandra.lucene.schema.analysis.tokenizer;
-
-import com.google.common.collect.Sets;
-import com.stratio.cassandra.lucene.common.JsonSerializer;
-import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.core.*;
-import org.apache.lucene.analysis.ngram.EdgeNGramTokenizer;
-import org.apache.lucene.analysis.ngram.NGramTokenizer;
-import org.apache.lucene.analysis.path.PathHierarchyTokenizer;
-import org.apache.lucene.analysis.path.ReversePathHierarchyTokenizer;
-import org.apache.lucene.analysis.pattern.PatternTokenizer;
-import org.apache.lucene.analysis.standard.ClassicTokenizer;
-import org.apache.lucene.analysis.standard.StandardTokenizer;
-import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer;
-import org.apache.lucene.analysis.th.ThaiTokenizer;
-import org.apache.lucene.analysis.wikipedia.WikipediaTokenizer;
-import org.junit.Test;
-
-import java.io.IOException;
-
-import static org.junit.Assert.*;
-
-/**
- * @author Eduardo Alonso {@literal }
- */
-public class TokenizerBuilderTest {
-
-    private T assertBuilderAndTokenizer(String json, Class expectedBuilderClass, Class expectedTokenizerClass) {
-        try {
-            TokenizerBuilder abstractBuilder = JsonSerializer.fromString(json, TokenizerBuilder.class);
-            assertEquals("Expected " + expectedBuilderClass.getName() + " class",
-                         expectedBuilderClass,
-                         abstractBuilder.getClass());
-            Tokenizer tokenizer = abstractBuilder.buildTokenizer();
-            assertEquals("Expected " + expectedTokenizerClass.getName() + " class",
-                         expectedTokenizerClass,
-                         tokenizer.getClass());
-            return (T) abstractBuilder;
-        } catch (Exception e) {
-            fail(e.getLocalizedMessage());
-            return null;
-        }
-    }
-
-    private void assertJsonParseFail(String json) throws IOException {
-        JsonSerializer.fromString(json, TokenizerBuilder.class);
-    }
-
-    private void assertJsonParseFail(String json, String message) {
-        try {
-            JsonSerializer.fromString(json, TokenizerBuilder.class);
-        } catch (IOException e) {
-            assertEquals("Expected IOException with message: " +
-                         message +
-                         " but received: " +
-                         e.getMessage() +
-                         " localMess: " +
-                         e.getLocalizedMessage(), message, e.getMessage());
-        }
-        assertFalse("Parsing: " + json + " must generate an IOException with message: " + message + " but does not.",
-                    true);
-    }
-
-    private void assertExactValue(String paramName, Object expected, Object received) {
-        assertEquals("Expected " +
-                     paramName +
-                     " equals to " +
-                     expected.toString() +
-                     " but received: " +
-                     received.toString(), expected, received);
-    }
-
-    @Test
-    public void
testClassicTokenizerValidJSON() { - String json = "{type: \"classic\", max_token_length: 250}"; - ClassicTokenizerBuilder builder = assertBuilderAndTokenizer(json, - ClassicTokenizerBuilder.class, - ClassicTokenizer.class); - assertExactValue("ClassicTokenizerBuilder.maxTokenLength", 250, builder.maxTokenLength); - } - - @Test - public void testClassicTokenizerDefaultValues() { - ClassicTokenizerBuilder builder = assertBuilderAndTokenizer("{type: \"classic\"}", - ClassicTokenizerBuilder.class, - ClassicTokenizer.class); - assertExactValue("ClassicTokenizerBuilder.maxTokenLength", - ClassicTokenizerBuilder.DEFAULT_MAX_TOKEN_LENGTH, - builder.maxTokenLength); - } - - @Test(expected = IOException.class) - public void testClassicTokenizerInvalidParam() throws IOException { - assertJsonParseFail("{type: \"classic\", max_toen_length: 250}"); - } - - @Test - public void testKeywordTokenizerValidJSON() { - String json = "{type: \"keyword\", buffer_size: 256}"; - KeywordTokenizerBuilder builder = assertBuilderAndTokenizer(json, - KeywordTokenizerBuilder.class, - KeywordTokenizer.class); - assertExactValue("KeywordTokenizer.bufferSize", 256, builder.bufferSize); - } - - @Test - public void testKeywordTokenizerDefaultValues() { - KeywordTokenizerBuilder builder = assertBuilderAndTokenizer("{type: \"keyword\"}", - KeywordTokenizerBuilder.class, - KeywordTokenizer.class); - assertExactValue("ClassicTokenizerBuilder.maxTokenLength", - KeywordTokenizerBuilder.DEFAULT_BUFFER_SIZE, - builder.bufferSize); - } - - @Test(expected = IOException.class) - public void testKeywordTokenizerInvalidJSON() throws IOException { - assertJsonParseFail("{type: \"keyword\", bufer_size: 256}"); - } - - @Test - public void testLetterTokenizerValidJSON() { - assertBuilderAndTokenizer("{type: \"letter\"}", LetterTokenizerBuilder.class, LetterTokenizer.class); - } - - @Test - public void testLowerCaseTokenizerValidJSON() { - assertBuilderAndTokenizer("{type: \"lower_case\"}", LowerCaseTokenizerBuilder.class, LowerCaseTokenizer.class); - } - - @Test - public void testThaiTokenizerValidJSON() { - assertBuilderAndTokenizer("{type: \"thai\"}", ThaiTokenizerBuilder.class, ThaiTokenizer.class); - } - - @Test - public void testNGramTokenizerValidJSON() { - String json = "{type: \"ngram\", min_gram: 1, max_gram: 2}"; - NGramTokenizerBuilder builder = assertBuilderAndTokenizer(json, - NGramTokenizerBuilder.class, - NGramTokenizer.class); - assertExactValue("NGramTokenizerBuilder.min_gram", NGramTokenizerBuilder.DEFAULT_MIN_GRAM, builder.minGram); - assertExactValue("NGramTokenizerBuilder.max_gram", NGramTokenizerBuilder.DEFAULT_MAX_GRAM, builder.maxGram); - } - - @Test - public void testNGramTokenizerDefaultValues() { - String json = "{type: \"ngram\"}"; - NGramTokenizerBuilder builder = assertBuilderAndTokenizer(json, - NGramTokenizerBuilder.class, - NGramTokenizer.class); - assertExactValue("NGramTokenizerBuilder.min_gram", NGramTokenizerBuilder.DEFAULT_MIN_GRAM, builder.minGram); - assertExactValue("NGramTokenizerBuilder.max_gram", NGramTokenizerBuilder.DEFAULT_MAX_GRAM, builder.maxGram); - } - - @Test(expected = IOException.class) - public void testNGramTokenizerInvalidJSON() throws IOException { - assertJsonParseFail("{type: \"ngram\", min_am: 1, max_gram: 1}"); - } - - @Test - public void testEdgeNGramTokenizerValidJSON() { - String json = "{type: \"edge_ngram\", min_gram: 1, max_gram: 1}"; - EdgeNGramTokenizerBuilder builder = assertBuilderAndTokenizer(json, - EdgeNGramTokenizerBuilder.class, - EdgeNGramTokenizer.class); - 
assertExactValue("EdgeNGramTokenizerBuilder.min_gram", - EdgeNGramTokenizer.DEFAULT_MIN_GRAM_SIZE, - builder.minGram); - assertExactValue("EdgeNGramTokenizerBuilder.max_gram", - EdgeNGramTokenizer.DEFAULT_MAX_GRAM_SIZE, - builder.maxGram); - } - - @Test - public void testEdgeNGramTokenizerDefaultValues() { - String json = "{type: \"edge_ngram\"}"; - EdgeNGramTokenizerBuilder builder = assertBuilderAndTokenizer(json, - EdgeNGramTokenizerBuilder.class, - EdgeNGramTokenizer.class); - assertExactValue("EdgeNGramTokenizerBuilder.min_gram", - EdgeNGramTokenizer.DEFAULT_MIN_GRAM_SIZE, - builder.minGram); - assertExactValue("EdgeNGramTokenizerBuilder.max_gram", - EdgeNGramTokenizer.DEFAULT_MAX_GRAM_SIZE, - builder.maxGram); - } - - @Test(expected = IOException.class) - public void testEdgeNGramTokenizerInvalidJSON() throws IOException { - assertJsonParseFail("{type: \"edge_ngram\", min_am: 1, max_gram: 1}"); - } - - @Test - public void testPathHierarchyTokenizerValidJSON() { - String json = "{type: \"path_hierarchy\", buffer_size: 246, delimiter: \"$\", replacement: \"%\", skip: 3}"; - PathHierarchyTokenizerBuilder builder = assertBuilderAndTokenizer(json, - PathHierarchyTokenizerBuilder.class, - PathHierarchyTokenizer.class); - assertExactValue("PathHierarchyTokenizerBuilder.buffer_size", 246, builder.bufferSize); - assertExactValue("PathHierarchyTokenizerBuilder.delimiter", '$', builder.delimiter); - assertExactValue("PathHierarchyTokenizerBuilder.replacement", '%', builder.replacement); - assertExactValue("PathHierarchyTokenizerBuilder.skip", 3, builder.skip); - } - - @Test - public void testPathHierarchyTokenizerDefaultValues() { - String json = "{type: \"path_hierarchy\"}"; - PathHierarchyTokenizerBuilder builder = assertBuilderAndTokenizer(json, - PathHierarchyTokenizerBuilder.class, - PathHierarchyTokenizer.class); - assertExactValue("PathHierarchyTokenizerBuilder.buffer_size", - PathHierarchyTokenizerBuilder.DEFAULT_BUFFER_SIZE, - builder.bufferSize); - assertExactValue("PathHierarchyTokenizerBuilder.delimiter", - PathHierarchyTokenizerBuilder.DEFAULT_DELIMITER, - builder.delimiter); - assertExactValue("PathHierarchyTokenizerBuilder.replacement", - PathHierarchyTokenizerBuilder.DEFAULT_REPLACEMENT, - builder.replacement); - assertExactValue("PathHierarchyTokenizerBuilder.skip", - PathHierarchyTokenizerBuilder.DEFAULT_SKIP, - builder.skip); - } - - @Test(expected = IOException.class) - public void testPathHierarchyTokenizerInvalidJSON() throws IOException { - assertJsonParseFail("{type: \"path_hierarchy\", buffer_size: 246, delimter: \"$\", replacement: \"%\", skip: 3}"); - } - - @Test - public void testPatternTokenizerValidJSON() { - String json = "{type: \"pattern\", pattern: \"[a-z]\", flags: 35, group: 0}"; - PatternTokenizerBuilder builder = assertBuilderAndTokenizer(json, - PatternTokenizerBuilder.class, - PatternTokenizer.class); - assertExactValue("PathHierarchyTokenizerBuilder.pattern", "[a-z]", builder.pattern); - assertExactValue("PathHierarchyTokenizerBuilder.flags", 35, builder.flags); - assertExactValue("PathHierarchyTokenizerBuilder.group", 0, builder.group); - } - - @Test - public void testPatternTokenizerDefaultValues() { - String json = "{type: \"pattern\"}"; - PatternTokenizerBuilder builder = assertBuilderAndTokenizer(json, - PatternTokenizerBuilder.class, - PatternTokenizer.class); - assertExactValue("PathHierarchyTokenizerBuilder.pattern", - PatternTokenizerBuilder.DEFAULT_PATTERN, - builder.pattern); - assertExactValue("PathHierarchyTokenizerBuilder.group", 
PatternTokenizerBuilder.DEFAULT_GROUP, builder.group); - assertExactValue("PathHierarchyTokenizerBuilder.group", PatternTokenizerBuilder.DEFAULT_FLAGS, builder.flags); - } - - @Test(expected = IOException.class) - public void testPatternTokenizerInvalidJSON() throws IOException { - assertJsonParseFail("{type: \"pattern\", paern: \"[a-z]\", flags: 35, group: 0}"); - } - - @Test - public void testReversePathHierarchyTokenizerValidJSON() { - String - json - = "{type: \"reverse_path_hierarchy\", buffer_size: 246, delimiter: \"/\", replacement: \"%\", skip: 3}"; - ReversePathHierarchyTokenizerBuilder builder = assertBuilderAndTokenizer(json, - ReversePathHierarchyTokenizerBuilder.class, - ReversePathHierarchyTokenizer.class); - assertExactValue("ReversePathHierarchyTokenizerBuilder.buffer_size", 246, builder.bufferSize); - assertExactValue("ReversePathHierarchyTokenizerBuilder.delimiter", '/', builder.delimiter); - assertExactValue("ReversePathHierarchyTokenizerBuilder.replacement", '%', builder.replacement); - assertExactValue("ReversePathHierarchyTokenizerBuilder.skip", 3, builder.skip); - } - - @Test - public void testReversePathHierarchyTokenizerDefaultValues() { - String json = "{type: \"reverse_path_hierarchy\"}"; - ReversePathHierarchyTokenizerBuilder builder = assertBuilderAndTokenizer(json, - ReversePathHierarchyTokenizerBuilder.class, - ReversePathHierarchyTokenizer.class); - assertExactValue("PathHierarchyTokenizerBuilder.buffer_size", - ReversePathHierarchyTokenizerBuilder.DEFAULT_BUFFER_SIZE, - builder.bufferSize); - assertExactValue("PathHierarchyTokenizerBuilder.delimiter", - ReversePathHierarchyTokenizerBuilder.DEFAULT_DELIMITER, - builder.delimiter); - assertExactValue("PathHierarchyTokenizerBuilder.replacement", - ReversePathHierarchyTokenizerBuilder.DEFAULT_REPLACEMENT, - builder.replacement); - assertExactValue("PathHierarchyTokenizerBuilder.skip", - ReversePathHierarchyTokenizerBuilder.DEFAULT_SKIP, - builder.skip); - } - - @Test(expected = IOException.class) - public void testReversePathHierarchyTokenizerInvalidJSON() throws IOException { - assertJsonParseFail( - "{type: \"reverse_path_hierarchy\", buffer_size: 246, delimiter: \"/\", replacent: \"%\", skip: 3}"); - } - - @Test - public void testStandardTokenizerValidJSON() { - String json = "{type: \"standard\", max_token_length: 246}"; - StandardTokenizerBuilder builder = assertBuilderAndTokenizer(json, - StandardTokenizerBuilder.class, - StandardTokenizer.class); - assertExactValue("StandardTokenizerBuilder.maxTokenLength", 246, builder.maxTokenLength); - } - - @Test - public void testStandardTokenizerDefaultValues() { - StandardTokenizerBuilder builder = assertBuilderAndTokenizer("{type: \"standard\"}", - StandardTokenizerBuilder.class, - StandardTokenizer.class); - assertExactValue("ClassicTokenizerBuilder.maxTokenLength", - StandardTokenizerBuilder.DEFAULT_MAX_TOKEN_LENGTH, - builder.maxTokenLength); - } - - @Test(expected = IOException.class) - public void testStandardTokenizerInvalidJSON() throws IOException { - assertJsonParseFail("{type: \"standard\", max_token_ngth: 246}"); - } - - @Test - public void testUAX29URLEmailTokenizerValidJSON() { - String json = "{type: \"uax29_url_email\", max_token_length: 249}"; - UAX29URLEmailTokenizerBuilder builder = assertBuilderAndTokenizer(json, - UAX29URLEmailTokenizerBuilder.class, - UAX29URLEmailTokenizer.class); - assertExactValue("UAX29URLEmailTokenizerBuilder.maxTokenLength", 249, builder.maxTokenLength); - } - - @Test - public void 
testUAX29URLEmailTokenizerDefaultValues() { - String json = "{type: \"uax29_url_email\"}"; - UAX29URLEmailTokenizerBuilder builder = assertBuilderAndTokenizer(json, - UAX29URLEmailTokenizerBuilder.class, - UAX29URLEmailTokenizer.class); - assertExactValue("UAX29URLEmailTokenizerBuilder.maxTokenLength", - UAX29URLEmailTokenizerBuilder.DEFAULT_MAX_TOKEN_LENGTH, - builder.maxTokenLength); - } - - @Test(expected = IOException.class) - public void testUAX29URLEmailTokenizerInvalidJSON() throws IOException { - assertJsonParseFail("{type: \"uax29_url_email\", max_token_lgth: 249}"); - } - - @Test - public void testUnicodeWhitespaceTokenizerValidJSON() { - String json = "{type:\"unicode_whitespace\"}"; - assertBuilderAndTokenizer(json, UnicodeWhitespaceTokenizerBuilder.class, UnicodeWhitespaceTokenizer.class); - } - - @Test - public void testWhitespaceTokenizerValidJSON() { - String json = "{type:\"whitespace\"}"; - assertBuilderAndTokenizer(json, WhitespaceTokenizerBuilder.class, WhitespaceTokenizer.class); - } - - @Test - public void testWikipediaTokenizerValidJSON() { - String json = "{type: \"wikipedia\", token_output: \"TOKENS_ONLY\", untokenized_types : [\"aaa\",\"bbb\"]}"; - WikipediaTokenizerBuilder builder = assertBuilderAndTokenizer(json, - WikipediaTokenizerBuilder.class, - WikipediaTokenizer.class); - assertExactValue("WikipediaTokenizerBuilder.token_output", - WikipediaTokenizerBuilder.TokenOutputValue.TOKENS_ONLY, - builder.tokenOutput); - assertExactValue("WikipediaTokenizerBuilder.untokenized_types", - Sets.newHashSet("aaa", "bbb"), - builder.untokenizedTypes); - } - - @Test - public void testWikipediaTokenizerDefaultValues() { - String json = "{type: \"wikipedia\"}"; - WikipediaTokenizerBuilder builder = assertBuilderAndTokenizer(json, - WikipediaTokenizerBuilder.class, - WikipediaTokenizer.class); - assertExactValue("WikipediaTokenizerBuilder.token_output", - WikipediaTokenizerBuilder.TokenOutputValue.TOKENS_ONLY, - builder.tokenOutput); - assertExactValue("WikipediaTokenizerBuilder.untokenized_types", Sets.newHashSet(), builder.untokenizedTypes); - } - - @Test(expected = IOException.class) - public void testWikipediaTokenizerInvalidJSON() throws IOException { - assertJsonParseFail("{type: \"wikipedia\", token_output: \"TOKENS_ONLY\", untoknized_types : [\"aaa\",\"bbb\"]}"); - } - - @Test(expected = IOException.class) - public void testInvalidTokenizerType() throws IOException { - assertJsonParseFail("{type: \"invalid_type\"}"); - } - -} diff --git a/plugin/src/test/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/TokenizerBuilderTest.scala b/plugin/src/test/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/TokenizerBuilderTest.scala new file mode 100644 index 000000000..3a04cdf62 --- /dev/null +++ b/plugin/src/test/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/TokenizerBuilderTest.scala @@ -0,0 +1,248 @@ +package com.stratio.cassandra.lucene.schema.analysis.tokenizer + +import com.stratio.cassandra.lucene.BaseScalaTest +import com.stratio.cassandra.lucene.common.JsonSerializer +import org.apache.lucene.analysis.core._ +import org.apache.lucene.analysis.ngram.{NGramTokenizer, EdgeNGramTokenizer} +import org.apache.lucene.analysis.path.{ReversePathHierarchyTokenizer, PathHierarchyTokenizer} +import org.apache.lucene.analysis.pattern.PatternTokenizer +import org.apache.lucene.analysis.standard.{UAX29URLEmailTokenizer, StandardTokenizer, ClassicTokenizer} +import org.apache.lucene.analysis.th.ThaiTokenizer +import 
org.apache.lucene.analysis.wikipedia.WikipediaTokenizer +import org.junit.runner.RunWith +import org.scalatest.junit.JUnitRunner + +import scala.util.Try + +/** Tests for [[TokenizerBuilder]]. + * + * @author Juan Pedro Gilaberte `jpgilaberte@stratio.com` + */ + +@RunWith(classOf[JUnitRunner]) +class TokenizerBuilderTest extends BaseScalaTest { + + def failFlow(throwable: Throwable) = fail(throwable.getMessage, throwable) + + def buildAbstractBuilder(json: String, builderClass: Class[_]): Any = Try(JsonSerializer.fromString(json, builderClass)).fold(failFlow, x => x) + + test("ClassicTokenizerBuilder parse JSON") { + val abstractBuilder = buildAbstractBuilder("{type: \"classic\", max_token_length: 1}", + classOf[TokenizerBuilder[ClassicTokenizer]]).asInstanceOf[TokenizerBuilder[ClassicTokenizer]] + assert(classOf[ClassicTokenizerBuilder].getCanonicalName == abstractBuilder.getClass.getCanonicalName) + val tokenizer = abstractBuilder.buildTokenizer + assert(classOf[ClassicTokenizer].getCanonicalName == tokenizer.getClass.getCanonicalName) + assert(1 == tokenizer.getMaxTokenLength) + } + + test("ClassicTokenizerBuilder parse JSON throws IllegalArgumentException") { + val abstractBuilder = buildAbstractBuilder("{type: \"classic\", max_token_length: 0}", + classOf[TokenizerBuilder[ClassicTokenizer]]).asInstanceOf[TokenizerBuilder[ClassicTokenizer]] + assert(classOf[ClassicTokenizerBuilder].getCanonicalName == abstractBuilder.getClass.getCanonicalName) + assertThrows[IllegalArgumentException]{abstractBuilder.buildTokenizer} + } + + test("ClassicTokenizerBuilder parse JSON default values") { + val abstractBuilder = buildAbstractBuilder("{type: \"classic\"}", + classOf[TokenizerBuilder[ClassicTokenizer]]).asInstanceOf[TokenizerBuilder[ClassicTokenizer]] + assert(classOf[ClassicTokenizerBuilder].getCanonicalName == abstractBuilder.getClass.getCanonicalName) + val tokenizer = abstractBuilder.buildTokenizer + assert(classOf[ClassicTokenizer].getCanonicalName == tokenizer.getClass.getCanonicalName) + assert(ClassicTokenizerBuilder.DEFAULT_MAX_TOKEN_LENGTH == tokenizer.getMaxTokenLength) + } + + test("EdgeNGramTokenizerBuilder parse JSON") { + val abstractBuilder = buildAbstractBuilder("{type: \"edge_ngram\", min_gram: 1, max_gram: 2}", + classOf[TokenizerBuilder[EdgeNGramTokenizer]]).asInstanceOf[TokenizerBuilder[EdgeNGramTokenizer]] + assert(classOf[EdgeNGramTokenizerBuilder].getCanonicalName == abstractBuilder.getClass.getCanonicalName) + val tokenizer = abstractBuilder.buildTokenizer + assert(classOf[EdgeNGramTokenizer].getCanonicalName == tokenizer.getClass.getCanonicalName) + assert(1 == abstractBuilder.asInstanceOf[EdgeNGramTokenizerBuilder].minGram) + assert(2 == abstractBuilder.asInstanceOf[EdgeNGramTokenizerBuilder].maxGram) + } + + test("EdgeNGramTokenizerBuilder parse JSON throws IllegalArgumentException") { + val abstractBuilder = buildAbstractBuilder("{type: \"edge_ngram\", min_gram: -1, max_gram: 2}", + classOf[TokenizerBuilder[EdgeNGramTokenizer]]).asInstanceOf[TokenizerBuilder[EdgeNGramTokenizer]] + assert(classOf[EdgeNGramTokenizerBuilder].getCanonicalName == abstractBuilder.getClass.getCanonicalName) + assertThrows[IllegalArgumentException]{abstractBuilder.buildTokenizer} + } + + test("KeywordTokenizerBuilder parse JSON") { + val abstractBuilder = buildAbstractBuilder("{type: \"keyword\", buffer_size: 256}", + classOf[TokenizerBuilder[KeywordTokenizer]]).asInstanceOf[TokenizerBuilder[KeywordTokenizer]] + assert(classOf[KeywordTokenizerBuilder].getCanonicalName == abstractBuilder.getClass.getCanonicalName) + val tokenizer = abstractBuilder.buildTokenizer + assert(classOf[KeywordTokenizer].getCanonicalName == tokenizer.getClass.getCanonicalName) + assert(256 == abstractBuilder.asInstanceOf[KeywordTokenizerBuilder].bufferSize) + } + + test("KeywordTokenizerBuilder parse JSON throws IllegalArgumentException") { + val abstractBuilder = buildAbstractBuilder("{type: \"keyword\", buffer_size: -256}", + classOf[TokenizerBuilder[KeywordTokenizer]]).asInstanceOf[TokenizerBuilder[KeywordTokenizer]] + assert(classOf[KeywordTokenizerBuilder].getCanonicalName == abstractBuilder.getClass.getCanonicalName) + assertThrows[IllegalArgumentException]{abstractBuilder.buildTokenizer} + } + + test("LetterTokenizerBuilder parse JSON") { + val abstractBuilder = buildAbstractBuilder("{type: \"letter\"}", + classOf[TokenizerBuilder[LetterTokenizer]]).asInstanceOf[TokenizerBuilder[LetterTokenizer]] + assert(classOf[LetterTokenizerBuilder].getCanonicalName == abstractBuilder.getClass.getCanonicalName) + val tokenizer = abstractBuilder.buildTokenizer + assert(classOf[LetterTokenizer].getCanonicalName == tokenizer.getClass.getCanonicalName) + } + + test("LowerCaseTokenizerBuilder parse JSON") { + val abstractBuilder = buildAbstractBuilder("{type: \"lower_case\"}", + classOf[TokenizerBuilder[LowerCaseTokenizer]]).asInstanceOf[TokenizerBuilder[LowerCaseTokenizer]] + assert(classOf[LowerCaseTokenizerBuilder].getCanonicalName == abstractBuilder.getClass.getCanonicalName) + val tokenizer = abstractBuilder.buildTokenizer + assert(classOf[LowerCaseTokenizer].getCanonicalName == tokenizer.getClass.getCanonicalName) + } + + test("NGramTokenizerBuilder parse JSON") { + val abstractBuilder = buildAbstractBuilder("{type: \"ngram\", min_gram: 1, max_gram: 2}", + classOf[TokenizerBuilder[NGramTokenizer]]).asInstanceOf[TokenizerBuilder[NGramTokenizer]] + assert(classOf[NGramTokenizerBuilder].getCanonicalName == abstractBuilder.getClass.getCanonicalName) + val tokenizer = abstractBuilder.buildTokenizer + assert(classOf[NGramTokenizer].getCanonicalName == tokenizer.getClass.getCanonicalName) + assert(1 == abstractBuilder.asInstanceOf[NGramTokenizerBuilder].minGram) + assert(2 == abstractBuilder.asInstanceOf[NGramTokenizerBuilder].maxGram) + } + + test("NGramTokenizerBuilder parse JSON throws IllegalArgumentException") { + val abstractBuilder = buildAbstractBuilder("{type: \"ngram\", min_gram: -1, max_gram: 2}", + classOf[TokenizerBuilder[NGramTokenizer]]).asInstanceOf[TokenizerBuilder[NGramTokenizer]] + assert(classOf[NGramTokenizerBuilder].getCanonicalName == abstractBuilder.getClass.getCanonicalName) + assertThrows[IllegalArgumentException]{abstractBuilder.buildTokenizer} + } + + test("PathHierarchyTokenizerBuilder parse JSON") { + val abstractBuilder = buildAbstractBuilder("{type: \"path_hierarchy\", buffer_size: 246, delimiter: \"$\", replacement: \"%\", skip: 3}", + classOf[TokenizerBuilder[PathHierarchyTokenizer]]).asInstanceOf[TokenizerBuilder[PathHierarchyTokenizer]] + assert(classOf[PathHierarchyTokenizerBuilder].getCanonicalName == abstractBuilder.getClass.getCanonicalName) + val tokenizer = abstractBuilder.buildTokenizer + assert(classOf[PathHierarchyTokenizer].getCanonicalName == tokenizer.getClass.getCanonicalName) + assert(246 == abstractBuilder.asInstanceOf[PathHierarchyTokenizerBuilder].bufferSize) + assert('$' == abstractBuilder.asInstanceOf[PathHierarchyTokenizerBuilder].delimiter) + assert('%' == abstractBuilder.asInstanceOf[PathHierarchyTokenizerBuilder].replacement) + assert(3 
== abstractBuilder.asInstanceOf[PathHierarchyTokenizerBuilder].skip) + } + + test("PathHierarchyTokenizerBuilder parse JSON throws IllegalArgumentException") { + val abstractBuilder = buildAbstractBuilder("{type: \"path_hierarchy\", buffer_size: 246, delimiter: \"$\", replacement: \"%\", skip: -3}", + classOf[TokenizerBuilder[PathHierarchyTokenizer]]).asInstanceOf[TokenizerBuilder[PathHierarchyTokenizer]] + assert(classOf[PathHierarchyTokenizerBuilder].getCanonicalName == abstractBuilder.getClass.getCanonicalName) + assertThrows[IllegalArgumentException]{abstractBuilder.buildTokenizer} + } + + test("PatternTokenizerBuilder parse JSON") { + val abstractBuilder = buildAbstractBuilder("{type: \"pattern\", pattern: \"[a-z]\", flags: 35, group: 0}", + classOf[TokenizerBuilder[PatternTokenizer]]).asInstanceOf[TokenizerBuilder[PatternTokenizer]] + assert(classOf[PatternTokenizerBuilder].getCanonicalName == abstractBuilder.getClass.getCanonicalName) + val tokenizer = abstractBuilder.buildTokenizer + assert(classOf[PatternTokenizer].getCanonicalName == tokenizer.getClass.getCanonicalName) + assert("[a-z]" == abstractBuilder.asInstanceOf[PatternTokenizerBuilder].pattern) + assert(35 == abstractBuilder.asInstanceOf[PatternTokenizerBuilder].flags) + assert(0 == abstractBuilder.asInstanceOf[PatternTokenizerBuilder].group) + } + + test("PatternTokenizerBuilder parse JSON throws IllegalArgumentException") { + val abstractBuilder = buildAbstractBuilder("{type: \"pattern\", pattern: \"[a-z]\", flags: 35, group: 2}", + classOf[TokenizerBuilder[PatternTokenizer]]).asInstanceOf[TokenizerBuilder[PatternTokenizer]] + assert(classOf[PatternTokenizerBuilder].getCanonicalName == abstractBuilder.getClass.getCanonicalName) + assertThrows[IllegalArgumentException]{abstractBuilder.buildTokenizer} + } + + test("ReversePathHierarchyTokenizerBuilder parse JSON") { + val abstractBuilder = buildAbstractBuilder("{type: \"reverse_path_hierarchy\", buffer_size: 246, delimiter: \"/\", replacement: \"%\", skip: 3}", + classOf[TokenizerBuilder[ReversePathHierarchyTokenizer]]).asInstanceOf[TokenizerBuilder[ReversePathHierarchyTokenizer]] + assert(classOf[ReversePathHierarchyTokenizerBuilder].getCanonicalName == abstractBuilder.getClass.getCanonicalName) + val tokenizer = abstractBuilder.buildTokenizer + assert(classOf[ReversePathHierarchyTokenizer].getCanonicalName == tokenizer.getClass.getCanonicalName) + assert(246 == abstractBuilder.asInstanceOf[ReversePathHierarchyTokenizerBuilder].bufferSize) + assert('/' == abstractBuilder.asInstanceOf[ReversePathHierarchyTokenizerBuilder].delimiter) + assert('%' == abstractBuilder.asInstanceOf[ReversePathHierarchyTokenizerBuilder].replacement) + assert(3 == abstractBuilder.asInstanceOf[ReversePathHierarchyTokenizerBuilder].skip) + } + + test("ReversePathHierarchyTokenizerBuilder parse JSON throws IllegalArgumentException") { + val abstractBuilder = buildAbstractBuilder("{type: \"reverse_path_hierarchy\", buffer_size: 246, delimiter: \"/\", replacement: \"%\", skip: -3}", + classOf[TokenizerBuilder[ReversePathHierarchyTokenizer]]).asInstanceOf[TokenizerBuilder[ReversePathHierarchyTokenizer]] + assert(classOf[ReversePathHierarchyTokenizerBuilder].getCanonicalName == abstractBuilder.getClass.getCanonicalName) + assertThrows[IllegalArgumentException]{abstractBuilder.buildTokenizer} + } + + test("StandardTokenizerBuilder parse JSON") { + val abstractBuilder = buildAbstractBuilder("{type: \"standard\", max_token_length: 246}", + 
classOf[TokenizerBuilder[StandardTokenizer]]).asInstanceOf[TokenizerBuilder[StandardTokenizer]] + assert(classOf[StandardTokenizerBuilder].getCanonicalName == abstractBuilder.getClass.getCanonicalName) + val tokenizer = abstractBuilder.buildTokenizer + assert(classOf[StandardTokenizer].getCanonicalName == tokenizer.getClass.getCanonicalName) + assert(246 == abstractBuilder.asInstanceOf[StandardTokenizerBuilder].maxTokenLength) + } + + test("StandardTokenizerBuilder parse JSON throws IllegalArgumentException") { + val abstractBuilder = buildAbstractBuilder("{type: \"standard\", max_token_length: -246}", + classOf[TokenizerBuilder[StandardTokenizer]]).asInstanceOf[TokenizerBuilder[StandardTokenizer]] + assert(classOf[StandardTokenizerBuilder].getCanonicalName == abstractBuilder.getClass.getCanonicalName) + assertThrows[IllegalArgumentException]{abstractBuilder.buildTokenizer} + } + + test("ThaiTokenizerBuilder parse JSON") { + val abstractBuilder = buildAbstractBuilder("{type: \"thai\"}", + classOf[TokenizerBuilder[ThaiTokenizer]]).asInstanceOf[TokenizerBuilder[ThaiTokenizer]] + assert(classOf[ThaiTokenizerBuilder].getCanonicalName == abstractBuilder.getClass.getCanonicalName) + val tokenizer = abstractBuilder.buildTokenizer + assert(classOf[ThaiTokenizer].getCanonicalName == tokenizer.getClass.getCanonicalName) + } + + test("UAX29URLEmailTokenizerBuilder parse JSON") { + val abstractBuilder = buildAbstractBuilder("{type: \"uax29_url_email\", max_token_length: 249}", + classOf[TokenizerBuilder[UAX29URLEmailTokenizer]]).asInstanceOf[TokenizerBuilder[UAX29URLEmailTokenizer]] + assert(classOf[UAX29URLEmailTokenizerBuilder].getCanonicalName == abstractBuilder.getClass.getCanonicalName) + val tokenizer = abstractBuilder.buildTokenizer + assert(classOf[UAX29URLEmailTokenizer].getCanonicalName == tokenizer.getClass.getCanonicalName) + assert(249 == abstractBuilder.asInstanceOf[UAX29URLEmailTokenizerBuilder].maxTokenLength) + } + + test("UAX29URLEmailTokenizerBuilder parse JSON throws IllegalArgumentException") { + val abstractBuilder = buildAbstractBuilder("{type: \"uax29_url_email\", max_token_length: -249}", + classOf[TokenizerBuilder[UAX29URLEmailTokenizer]]).asInstanceOf[TokenizerBuilder[UAX29URLEmailTokenizer]] + assert(classOf[UAX29URLEmailTokenizerBuilder].getCanonicalName == abstractBuilder.getClass.getCanonicalName) + assertThrows[IllegalArgumentException]{abstractBuilder.buildTokenizer} + } + + test("UnicodeWhitespaceTokenizerBuilder parse JSON") { + val abstractBuilder = buildAbstractBuilder("{type:\"unicode_whitespace\"}", + classOf[TokenizerBuilder[UnicodeWhitespaceTokenizer]]).asInstanceOf[TokenizerBuilder[UnicodeWhitespaceTokenizer]] + assert(classOf[UnicodeWhitespaceTokenizerBuilder].getCanonicalName == abstractBuilder.getClass.getCanonicalName) + val tokenizer = abstractBuilder.buildTokenizer + assert(classOf[UnicodeWhitespaceTokenizer].getCanonicalName == tokenizer.getClass.getCanonicalName) + } + + test("WhitespaceTokenizerBuilder parse JSON") { + val abstractBuilder = buildAbstractBuilder("{type:\"whitespace\"}", + classOf[TokenizerBuilder[WhitespaceTokenizer]]).asInstanceOf[TokenizerBuilder[WhitespaceTokenizer]] + assert(classOf[WhitespaceTokenizerBuilder].getCanonicalName == abstractBuilder.getClass.getCanonicalName) + val tokenizer = abstractBuilder.buildTokenizer + assert(classOf[WhitespaceTokenizer].getCanonicalName == tokenizer.getClass.getCanonicalName) + } + + test("WikipediaTokenizerBuilder parse JSON") { + val abstractBuilder = buildAbstractBuilder("{type: 
\"wikipedia\", token_output: \"TOKENS_ONLY\", untokenized_types : [\"aaa\",\"bbb\"]}", + classOf[TokenizerBuilder[WikipediaTokenizer]]).asInstanceOf[TokenizerBuilder[WikipediaTokenizer]] + assert(classOf[WikipediaTokenizerBuilder].getCanonicalName == abstractBuilder.getClass.getCanonicalName) + val tokenizer = abstractBuilder.buildTokenizer + assert(classOf[WikipediaTokenizer].getCanonicalName == tokenizer.getClass.getCanonicalName) + assert(TokenOutputEnum.TOKENS_ONLY.toString == abstractBuilder.asInstanceOf[WikipediaTokenizerBuilder].tokenOutput) + assert(Array("aaa", "bbb").apply(1) == abstractBuilder.asInstanceOf[WikipediaTokenizerBuilder].untokenizedTypes.apply(1)) + } + + test("WikipediaTokenizerBuilder parse JSON throws NoSuchElementException") { + val abstractBuilder = buildAbstractBuilder("{type: \"wikipedia\", token_output: \"OKENS_ONLY\", untokenized_types : [\"aaa\",\"bbb\"]}", + classOf[TokenizerBuilder[WikipediaTokenizer]]).asInstanceOf[TokenizerBuilder[WikipediaTokenizer]] + assert(classOf[WikipediaTokenizerBuilder].getCanonicalName == abstractBuilder.getClass.getCanonicalName) + assertThrows[NoSuchElementException]{abstractBuilder.buildTokenizer} + } +} diff --git a/testsAT/src/test/java/com/stratio/cassandra/lucene/testsAT/schema/analysis/tokenizer/TokenizerBuilderIT.java b/testsAT/src/test/java/com/stratio/cassandra/lucene/testsAT/schema/analysis/tokenizer/TokenizerBuilderIT.java new file mode 100644 index 000000000..8d10474be --- /dev/null +++ b/testsAT/src/test/java/com/stratio/cassandra/lucene/testsAT/schema/analysis/tokenizer/TokenizerBuilderIT.java @@ -0,0 +1,695 @@ +package com.stratio.cassandra.lucene.testsAT.schema.analysis.tokenizer; + +import com.stratio.cassandra.lucene.builder.index.schema.analysis.tokenizer.*; +import com.stratio.cassandra.lucene.testsAT.BaseIT; +import com.stratio.cassandra.lucene.testsAT.util.CassandraUtils; +import org.junit.AfterClass; +import org.junit.BeforeClass; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +import static com.stratio.cassandra.lucene.builder.Builder.*; + +/** + * Test partitioning on partition key column. + * + * @author Andres de la Pena {@literal } + */ +@RunWith(JUnit4.class) +public class TokenizerBuilderIT extends BaseIT{ + + private static CassandraUtils utils; + + @BeforeClass + public static void before() {} + + @AfterClass + public static void after() { + CassandraUtils.dropKeyspaceIfNotNull(utils); + } + + @Test + public void testClassicTokenizer() { + utils = CassandraUtils.builder("tokenizer") + .withPartitionKey("pk") + .withColumn("pk", "int") + .withColumn("rc", "text", textMapper().analyzer("en")) + .withAnalyzer("en", customAnalyzer(new ClassicTokenizer())) + .build() + .createKeyspace() + .createTable() + .insert("pk,rc", 1, "The 2 QUICK Brown-Foxes jumped the lazy dog bone. 
and/or") + .createIndex().refresh() + .filter(all()).check(1) + .filter(none()).check(0) + .filter(match("rc", "The")).check(1) + .filter(match("rc", "the")).check(1) + .filter(match("rc", "2")).check(1) + .filter(match("rc", "QUICK")).check(1) + .filter(match("rc", "quick")).check(0) + .filter(match("rc", "Quick")).check(0) + .filter(match("rc", "Brown")).check(1) + .filter(match("rc", "brown")).check(0) + .filter(match("rc", "brOwn")).check(0) + .filter(match("rc", "Foxes")).check(1) + .filter(match("rc", "-Foxes")).check(1) + .filter(match("rc", "Brown-Foxes")).check(1) + .filter(match("rc", "BrownFoxes")).check(0) + .filter(match("rc", "brown-Foxes")).check(0) + .filter(match("rc", "brown-foxes")).check(0) + .filter(match("rc", "jumped")).check(1) + .filter(match("rc", "jump")).check(0) + .filter(match("rc", "dog")).check(1) + .filter(match("rc", "dogs")).check(0) + .filter(match("rc", "and")).check(1) + .filter(match("rc", "or")).check(1) + .filter(match("rc", "and/or")).check(1) + .filter(match("rc", "and*or")).check(1) + .filter(match("rc", "and-or")).check(1) + .filter(phrase("rc", "jumped the")).check(1) + .filter(phrase("rc", "jump the")).check(0) + .filter(phrase("rc", " dog bone. and/or")).check(1) + .filter(phrase("rc", " dog bone and/or")).check(1) + .filter(phrase("rc", " dog bone/ and*or")).check(1) + .filter(phrase("rc", " dog Bone. and/or")).check(0) + .filter(phrase("rc", " dog bone. and or")).check(1) + .filter(phrase("rc", " dog bone and or")).check(1) + .filter(fuzzy("rc", "jumped")).check(1) + .filter(fuzzy("rc", "jump")).check(1) + .filter(fuzzy("rc", "jumper")).check(1) + .filter(fuzzy("rc", "ajumper")).check(1) + .filter(fuzzy("rc", "gjumperd")).check(1) + .filter(fuzzy("rc", "dogjumperdog")).check(0) + .filter(contains("rc", "jumped")).check(1) + //TODO: check this behaviour + .filter(contains("rc", "jump")).check(0) + .filter(contains("rc", "jumper")).check(0) + .filter(contains("rc", "ajumped")).check(0) + .filter(prefix("rc", "jump")).check(1) + .filter(prefix("rc", "ju")).check(1) + .filter(regexp("rc", "[j][aeiou]{1}.*")).check(1) + .filter(regexp("rc", "[j][aeiou]{2}.*")).check(0) + .filter(wildcard("rc", "*jumpe*")).check(1) + .filter(wildcard("rc", "*ju*pe*")).check(1) + .filter(wildcard("rc", "*jum*pe*")).check(1) + .filter(wildcard("rc", "jumpe*")).check(1) + .filter(wildcard("rc", "**e*")).check(1) + .filter(wildcard("rc", "and*or*")).check(0) + .filter(wildcard("rc", "and/or*")).check(0) + .filter(wildcard("rc", "*and/or")).check(0) + .filter(wildcard("rc", "and*or")).check(0) + .filter(wildcard("rc", "*and/or*")).check(0) + .filter(wildcard("rc", "*and/or*")).check(0) + .filter(wildcard("rc", "*.and.*")).check(0); + } + + @Test + public void testNGramTokenizer() { + utils = CassandraUtils.builder("tokenizer") + .withPartitionKey("pk") + .withColumn("pk", "int") + .withColumn("rc", "text", textMapper().analyzer("en")) + .withAnalyzer("en", customAnalyzer(new NGramTokenizer(1, 2))) + .build() + .createKeyspace() + .createTable() + .insert("pk,rc", 1, "abcde") + .createIndex().refresh() + .filter(all()).check(1) + .filter(none()).check(0) + .filter(match("rc", "a")).check(1) + .filter(match("rc", "ab")).check(1) + .filter(match("rc", "abc")).check(1) + .filter(match("rc", "abcde")).check(1) + .filter(match("rc", "cd")).check(1) + .filter(match("rc", "de")).check(1) + .filter(match("rc", "be")).check(0) + .filter(phrase("rc", "a")).check(1) + .filter(phrase("rc", "ab")).check(1) + .filter(phrase("rc", "abc")).check(1) + .filter(phrase("rc", 
"abcde")).check(1) + .filter(phrase("rc", "cd")).check(1) + .filter(phrase("rc", "de")).check(1) + .filter(phrase("rc", "be")).check(0) + .filter(fuzzy("rc", "a")).check(1) + .filter(fuzzy("rc", "ab")).check(1) + .filter(fuzzy("rc", "abc")).check(1) + .filter(fuzzy("rc", "cd")).check(1) + .filter(fuzzy("rc", "de")).check(1) + .filter(contains("rc", "a")).check(1) + .filter(contains("rc", "abc")).check(1) + .filter(contains("rc", "be")).check(0) + .filter(prefix("rc", "ab")).check(1) + .filter(prefix("rc", "bc")).check(1) + .filter(regexp("rc", "[aeiou]{1}bc*")).check(1) + .filter(regexp("rc", "[j][aeiou]{2}.*")).check(0) + .filter(wildcard("rc", "a*")).check(1) + .filter(wildcard("rc", "*cde*")).check(0); + } + + @Test + public void testKeywordTokenizer() { + utils = CassandraUtils.builder("tokenizer") + .withPartitionKey("pk") + .withColumn("pk", "int") + .withColumn("rc", "text", textMapper().analyzer("en")) + .withAnalyzer("en", customAnalyzer(new KeywordTokenizer())) + .build() + .createKeyspace() + .createTable() + .insert("pk,rc", 1, "The 2 QUICK Brown-Foxes jumped the lazy dog bone. and/or") + .createIndex().refresh() + .filter(all()).check(1) + .filter(none()).check(0) + .filter(match("rc", "The 2 QUICK Brown-Foxes jumped the lazy dog bone. and/or")).check(1) + .filter(match("rc", "The 2 QUICK Brown-Foxes jumped the")).check(0) + .filter(phrase("rc", "The 2 QUICK Brown-Foxes jumped the lazy dog bone. and/or")).check(1) + .filter(phrase("rc", "The 2 QUICK Brown-Foxes jumped the")).check(0) + .filter(fuzzy("rc", "The 2 QUICK Brown-Foxes jumped the lazy dog bone. and/or")).check(1) + .filter(fuzzy("rc", "The 2 QUICK Brown-Foxes jumped the lazy dog bone. and/o")).check(1) + .filter(fuzzy("rc", "The 2 QUICK Brown-Foxes jumped the lazy dog bone.")).check(0) + //TODO: check this behaviour + .filter(contains("rc", "The 2 QUICK Brown-Foxes")).check(0) + .filter(contains("rc", "jump")).check(0) + .filter(contains("rc", "and/or")).check(0) + .filter(contains("rc", "ajumped")).check(0) + .filter(prefix("rc", "The")).check(1) + .filter(prefix("rc", "ju")).check(0) + .filter(wildcard("rc", "*2 QUICK Brown-Foxes jumped the lazy dog bone. and/or")).check(1) + .filter(wildcard("rc", "The 2 QUICK Brown-Foxes jumped the lazy dog bone.*")).check(1) + .filter(wildcard("rc", "*Brown-Foxes jumped the lazy dog bone.*")).check(1) + .filter(wildcard("rc", "*QUICK * jumped*")).check(1); + + } + + @Test + public void testLetterTokenizer() { + utils = CassandraUtils.builder("tokenizer") + .withPartitionKey("pk") + .withColumn("pk", "int") + .withColumn("rc", "text", textMapper().analyzer("en")) + .withAnalyzer("en", customAnalyzer(new LetterTokenizer())) + .build() + .createKeyspace() + .createTable() + .insert("pk,rc", 1, "The 2 QUICK Brown-Foxes jumped the lazy dog bone. and/or") + .createIndex().refresh() + .filter(all()).check(1) + .filter(none()).check(0) + .filter(match("rc", "The 2 QUICK Brown-Foxes jumped the lazy dog bone. and/or")).check(1) + .filter(match("rc", "The 2 QUICK Brown-Foxes jumped the")).check(1) + .filter(phrase("rc", "The 2 QUICK Brown-Foxes jumped the lazy dog bone. and/or")).check(1) + .filter(phrase("rc", "The 2 QUICK Brown-Foxes jumped the")).check(1) + .filter(fuzzy("rc", "The 2 QUICK Brown-Foxes jumped the lazy dog bone. 
and/o")).check(0) + .filter(fuzzy("rc", "The 2 QUICK Brown-Foxes jumped the lazy dog bone.")).check(0) + //TODO: check this behaviour + .filter(contains("rc", "The 2 QUICK Brown-Foxes")).check(1) + .filter(contains("rc", "jump")).check(0) + .filter(contains("rc", "and/or")).check(1) + .filter(contains("rc", "ajumped")).check(0) + .filter(prefix("rc", "The")).check(1) + .filter(prefix("rc", "ju")).check(1) + .filter(wildcard("rc", "*")).check(1) + .filter(wildcard("rc", "*QUICK*")).check(1); + } + + @Test + public void testLowerCaseTokenizer() { + utils = CassandraUtils.builder("tokenizer") + .withPartitionKey("pk") + .withColumn("pk", "int") + .withColumn("rc", "text", textMapper().analyzer("en")) + .withAnalyzer("en", customAnalyzer(new LowerCaseTokenizer())) + .build() + .createKeyspace() + .createTable() + .insert("pk,rc", 1, "The 2 QUICK Brown-Foxes jumped the lazy dog bone. and/or") + .createIndex().refresh() + .filter(all()).check(1) + .filter(none()).check(0) + .filter(match("rc", "The 2 QUICK Brown-Foxes jumped the lazy dog bone. and/or")).check(1) + .filter(match("rc", "The 2 QUICK Brown-Foxes jumped the")).check(1) + .filter(phrase("rc", "The 2 QUICK Brown-Foxes jumped the lazy dog bone. and/or")).check(1) + .filter(phrase("rc", "The 2 QUICK Brown-Foxes jumped the")).check(1) + .filter(fuzzy("rc", "The 2 QUICK Brown-Foxes jumped the lazy dog bone. and/o")).check(0) + .filter(fuzzy("rc", "The 2 QUICK Brown-Foxes jumped the lazy dog bone.")).check(0) + //TODO: check this behaviour + .filter(contains("rc", "The 2 QUICK Brown-Foxes")).check(1) + .filter(contains("rc", "jump")).check(0) + .filter(contains("rc", "and/or")).check(1) + .filter(contains("rc", "ajumped")).check(0) + .filter(prefix("rc", "the")).check(1) + .filter(prefix("rc", "The")).check(0) + .filter(prefix("rc", "ju")).check(1) + .filter(wildcard("rc", "*")).check(1) + .filter(wildcard("rc", "*quick*")).check(1); + } + + @Test + public void testEdgeNGramTokenizer() { + utils = CassandraUtils.builder("tokenizer") + .withPartitionKey("pk") + .withColumn("pk", "int") + .withColumn("rc", "text", textMapper().analyzer("en")) + .withAnalyzer("en", customAnalyzer(new EdgeNGramTokenizer(1, 2))) + .build() + .createKeyspace() + .createTable() + .insert("pk,rc", 1, "abcde") + .createIndex().refresh() + .filter(all()).check(1) + .filter(none()).check(0) + .filter(match("rc", "a")).check(1) + .filter(match("rc", "ab")).check(1) + .filter(match("rc", "abc")).check(1) + .filter(match("rc", "abcde")).check(1) + .filter(match("rc", "cd")).check(0) + .filter(match("rc", "de")).check(0) + .filter(match("rc", "be")).check(0) + .filter(phrase("rc", "a")).check(1) + .filter(phrase("rc", "ab")).check(1) + .filter(phrase("rc", "abc")).check(1) + .filter(phrase("rc", "abcde")).check(1) + .filter(phrase("rc", "cd")).check(0) + .filter(phrase("rc", "de")).check(0) + .filter(phrase("rc", "be")).check(0) + .filter(fuzzy("rc", "a")).check(1) + .filter(fuzzy("rc", "ab")).check(1) + .filter(fuzzy("rc", "abc")).check(1) + .filter(fuzzy("rc", "cd")).check(0) + .filter(fuzzy("rc", "de")).check(0) + .filter(contains("rc", "a")).check(1) + .filter(contains("rc", "abc")).check(1) + .filter(contains("rc", "be")).check(0) + .filter(prefix("rc", "ab")).check(1) + .filter(prefix("rc", "bc")).check(0) + .filter(regexp("rc", "[aeiou]{1}bc*")).check(1) + .filter(regexp("rc", "[j][aeiou]{2}.*")).check(0) + .filter(wildcard("rc", "a*")).check(1) + .filter(wildcard("rc", "*cde*")).check(0); + } + + @Test + public void testPathHierarchyTokenizer() { + utils = 
CassandraUtils.builder("tokenizer") + .withPartitionKey("pk") + .withColumn("pk", "int") + .withColumn("rc", "text", textMapper().analyzer("en")) + .withAnalyzer("en", customAnalyzer(new PathHierarchyTokenizer())) + .build() + .createKeyspace() + .createTable() + .insert("pk,rc", 1, "/a/b/c/d/e/f/g") + .createIndex().refresh() + .filter(all()).check(1) + .filter(none()).check(0) + .filter(match("rc", "/a/c")).check(1) + .filter(match("rc", "/ac")).check(0) + .filter(match("rc", "/a/e")).check(1) + .filter(match("rc", "/a/e/g")).check(1) + .filter(match("rc", "/a")).check(1) + .filter(match("rc", "/a/b")).check(1) + .filter(match("rc", "/a/b/c")).check(1) + .filter(fuzzy("rc", "/a")).check(1) + .filter(fuzzy("rc", "/a/c")).check(1) + .filter(fuzzy("rc", "a/e/g")).check(0) + .filter(match("rc", "abc")).check(0) + .filter(contains("rc", "/a")).check(1) + .filter(contains("rc", "/a/b/c")).check(1) + .filter(contains("rc", "b/c")).check(0) + .filter(prefix("rc", "/a/b")).check(1) + .filter(prefix("rc", "b/c")).check(0) + .filter(regexp("rc", "/[aeiou]{1}/b/c*")).check(1) + .filter(regexp("rc", "[j][aeiou]{2}.*")).check(0) + .filter(wildcard("rc", "/a*")).check(1) + .filter(wildcard("rc", "*/c*")).check(1) + .filter(wildcard("rc", "*c/*")).check(1); + } + + @Test + public void testPatternTokenizer() { + utils = CassandraUtils.builder("tokenizer") + .withPartitionKey("pk") + .withColumn("pk", "int") + .withColumn("rc", "text", textMapper().analyzer("en")) + .withAnalyzer("en", customAnalyzer(new PatternTokenizer("/", 0, -1))) + .build() + .createKeyspace() + .createTable() + .insert("pk,rc", 1, "/a/b/c/d/e/f/g") + .createIndex().refresh() + .filter(all()).check(1) + .filter(none()).check(0) + .filter(match("rc", "a")).check(1) + .filter(match("rc", "/ac")).check(0) + .filter(match("rc", "/a/e")).check(0) + .filter(match("rc", "/a/e/g")).check(0) + .filter(match("rc", "a/b")).check(1) + .filter(match("rc", "/a/b/c")).check(1) + .filter(fuzzy("rc", "a")).check(1) + .filter(fuzzy("rc", "/a")).check(0) + .filter(fuzzy("rc", "b")).check(1) + .filter(fuzzy("rc", "a/e/g")).check(0) + .filter(match("rc", "abc")).check(0) + .filter(contains("rc", "/a")).check(1) + .filter(contains("rc", "/a/b/c")).check(1) + .filter(contains("rc", "b/c")).check(1) + .filter(prefix("rc", "a")).check(1) + .filter(prefix("rc", "b/c")).check(0) + .filter(regexp("rc", "[aeiou]{1}")).check(1) + .filter(regexp("rc", "[j][aeiou]{2}.*")).check(0) + .filter(wildcard("rc", "a*")).check(1) + .filter(wildcard("rc", "*/c*")).check(0) + .filter(wildcard("rc", "*c*")).check(1); + } + + @Test + public void testReversePathHierarchyTokenizer() { + utils = CassandraUtils.builder("tokenizer") + .withPartitionKey("pk") + .withColumn("pk", "int") + .withColumn("rc", "text", textMapper().analyzer("en")) + .withAnalyzer("en", customAnalyzer(new ReversePathHierarchyTokenizer())) + .build() + .createKeyspace() + .createTable() + .insert("pk,rc", 1, "g/f/e/d/c/b/a/") + .createIndex().refresh() + .filter(all()).check(1) + .filter(none()).check(0) + .filter(match("rc", "a/")).check(1) + .filter(match("rc", "/ac")).check(0) + .filter(match("rc", "/a/e")).check(0) + .filter(match("rc", "/a/e/g")).check(0) + .filter(match("rc", "b/a/")).check(1) + .filter(match("rc", "c/b/a/")).check(1) + .filter(match("rc", "abc")).check(0) + .filter(fuzzy("rc", "a/")).check(1) + .filter(fuzzy("rc", "/a")).check(1) + .filter(fuzzy("rc", "b/")).check(1) + .filter(fuzzy("rc", "a/e/g")).check(0) + .filter(contains("rc", "a/")).check(1) + .filter(contains("rc", 
"c/b/a/")).check(1) + .filter(contains("rc", "b/c")).check(0) + .filter(prefix("rc", "a")).check(1) + .filter(prefix("rc", "b/c")).check(0) + .filter(regexp("rc", "[aeiou]{1}/")).check(1) + .filter(regexp("rc", "[j][aeiou]{2}.*")).check(0) + .filter(wildcard("rc", "a*")).check(1) + .filter(wildcard("rc", "*c/*")).check(1) + .filter(wildcard("rc", "*c*")).check(1); + } + + @Test + public void testStandardTokenizer() { + utils = CassandraUtils.builder("tokenizer") + .withPartitionKey("pk") + .withColumn("pk", "int") + .withColumn("rc", "text", textMapper().analyzer("en")) + .withAnalyzer("en", customAnalyzer(new StandardTokenizer())) + .build() + .createKeyspace() + .createTable() + .insert("pk,rc", 1, "The 2 QUICK Brown-Foxes jumped the lazy dog bone. and/or") + .createIndex().refresh() + .filter(all()).check(1) + .filter(none()).check(0) + .filter(match("rc", "The")).check(1) + .filter(match("rc", "the")).check(1) + .filter(match("rc", "2")).check(1) + .filter(match("rc", "QUICK")).check(1) + .filter(match("rc", "quick")).check(0) + .filter(match("rc", "Quick")).check(0) + .filter(match("rc", "Brown")).check(1) + .filter(match("rc", "brown")).check(0) + .filter(match("rc", "brOwn")).check(0) + .filter(match("rc", "Foxes")).check(1) + .filter(match("rc", "-Foxes")).check(1) + .filter(match("rc", "Brown-Foxes")).check(1) + .filter(match("rc", "BrownFoxes")).check(0) + .filter(match("rc", "brown-Foxes")).check(0) + .filter(match("rc", "brown-foxes")).check(0) + .filter(match("rc", "jumped")).check(1) + .filter(match("rc", "jump")).check(0) + .filter(match("rc", "dog")).check(1) + .filter(match("rc", "dogs")).check(0) + .filter(match("rc", "and")).check(1) + .filter(match("rc", "or")).check(1) + .filter(match("rc", "and/or")).check(1) + .filter(match("rc", "and*or")).check(1) + .filter(match("rc", "and-or")).check(1) + .filter(phrase("rc", "jumped the")).check(1) + .filter(phrase("rc", "jump the")).check(0) + .filter(phrase("rc", " dog bone. and/or")).check(1) + .filter(phrase("rc", " dog bone and/or")).check(1) + .filter(phrase("rc", " dog bone/ and*or")).check(1) + .filter(phrase("rc", " dog Bone. and/or")).check(0) + .filter(phrase("rc", " dog bone. 
and or")).check(1) + .filter(phrase("rc", " dog bone and or")).check(1) + .filter(fuzzy("rc", "jumped")).check(1) + .filter(fuzzy("rc", "jump")).check(1) + .filter(fuzzy("rc", "jumper")).check(1) + .filter(fuzzy("rc", "ajumper")).check(1) + .filter(fuzzy("rc", "gjumperd")).check(1) + .filter(fuzzy("rc", "dogjumperdog")).check(0) + .filter(contains("rc", "jumped")).check(1) + //TODO: check this behaviour + .filter(contains("rc", "jump")).check(0) + .filter(contains("rc", "jumper")).check(0) + .filter(contains("rc", "ajumped")).check(0) + .filter(prefix("rc", "jump")).check(1) + .filter(prefix("rc", "ju")).check(1) + .filter(regexp("rc", "[j][aeiou]{1}.*")).check(1) + .filter(regexp("rc", "[j][aeiou]{2}.*")).check(0) + .filter(wildcard("rc", "*jumpe*")).check(1) + .filter(wildcard("rc", "*ju*pe*")).check(1) + .filter(wildcard("rc", "*jum*pe*")).check(1) + .filter(wildcard("rc", "jumpe*")).check(1) + .filter(wildcard("rc", "**e*")).check(1) + .filter(wildcard("rc", "and*or*")).check(0) + .filter(wildcard("rc", "and/or*")).check(0) + .filter(wildcard("rc", "*and/or")).check(0) + .filter(wildcard("rc", "and*or")).check(0) + .filter(wildcard("rc", "*and/or*")).check(0) + .filter(wildcard("rc", "*and/or*")).check(0) + .filter(wildcard("rc", "*.and.*")).check(0); + } + + @Test + public void testThaiTokenizer() { + utils = CassandraUtils.builder("tokenizer") + .withPartitionKey("pk") + .withColumn("pk", "int") + .withColumn("rc", "text", textMapper().analyzer("en")) + .withAnalyzer("en", customAnalyzer(new ThaiTokenizer())) + .build() + .createKeyspace() + .createTable() + .insert("pk,rc", 1, "การที่ได้ต้องแสดงว่างานดี") // { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี" } + .createIndex().refresh() + .filter(all()).check(1) + .filter(none()).check(0) + .filter(match("rc", "การ")).check(1) + .filter(match("rc", "การที่")).check(1) + .filter(phrase("rc", "ได้ ต้อง")).check(1) + .filter(phrase("rc", "แสดง งาน")).check(0) + .filter(fuzzy("rc", "การ")).check(1) + .filter(fuzzy("rc", "การว่า")).check(0) + .filter(contains("rc", "สดง")).check(0) + .filter(prefix("rc", "กา")).check(1) + .filter(prefix("rc", "งาน")).check(1) + .filter(wildcard("rc", "*การ*")).check(1) + .filter(wildcard("rc", "การ*")).check(1) + .filter(wildcard("rc", "*การแสดง*")).check(0); + } + + @Test + public void testUAX29URLEmailTokenizerTokenizer() { + utils = CassandraUtils.builder("tokenizer") + .withPartitionKey("pk") + .withColumn("pk", "int") + .withColumn("rc", "text", textMapper().analyzer("en")) + .withAnalyzer("en", customAnalyzer(new UAX29URLEmailTokenizer())) + .build() + .createKeyspace() + .createTable() + .insert("pk,rc", 1, "Email me at john.smith@global-international.com") + .createIndex().refresh() + .filter(all()).check(1) + .filter(none()).check(0) + .filter(match("rc", "john.smith@global-international.com")).check(1) + .filter(match("rc", "Email")).check(1) + .filter(phrase("rc", "me at john.smith@global-international.com")).check(1) + .filter(phrase("rc", "at me")).check(0) + .filter(fuzzy("rc", "john.smith@global-international.com")).check(1) + .filter(fuzzy("rc", "john.smith@global-international.")).check(0) + .filter(contains("rc", "john.smith@global-")).check(0) + .filter(prefix("rc", "john.smith")).check(1) + .filter(wildcard("rc", "*global-international.com*")).check(1) + .filter(wildcard("rc", "*jhon*global-international.com")).check(0); + } + + @Test + public void testUnicodeWhiteSpaceTokenizerTokenizer() { + utils = CassandraUtils.builder("tokenizer") + .withPartitionKey("pk") + 
.withColumn("pk", "int") + .withColumn("rc", "text", textMapper().analyzer("en")) + .withAnalyzer("en", customAnalyzer(new UnicodeWhitespaceTokenizer())) + .build() + .createKeyspace() + .createTable() + .insert("pk,rc", 1, "The 2 QUICK Brown-Foxes jumped the lazy dog bone. and/or") + .createIndex().refresh() + .filter(all()).check(1) + .filter(none()).check(0) + .filter(match("rc", "The")).check(1) + .filter(match("rc", "the")).check(1) + .filter(match("rc", "2")).check(1) + .filter(match("rc", "QUICK")).check(1) + .filter(match("rc", "quick")).check(0) + .filter(match("rc", "Quick")).check(0) + .filter(match("rc", "Brown")).check(0) + .filter(match("rc", "brown")).check(0) + .filter(match("rc", "brOwn")).check(0) + .filter(match("rc", "Foxes")).check(0) + .filter(match("rc", "-Foxes")).check(0) + .filter(match("rc", "Brown-Foxes")).check(1) + .filter(match("rc", "BrownFoxes")).check(0) + .filter(match("rc", "brown-Foxes")).check(0) + .filter(match("rc", "brown-foxes")).check(0) + .filter(match("rc", "jumped")).check(1) + .filter(match("rc", "jump")).check(0) + .filter(match("rc", "dog")).check(1) + .filter(match("rc", "dogs")).check(0) + .filter(match("rc", "and")).check(0) + .filter(match("rc", "or")).check(0) + .filter(phrase("rc", "jumped the")).check(1) + .filter(phrase("rc", "jump the")).check(0) + .filter(phrase("rc", " dog bone. and/or")).check(1) + .filter(fuzzy("rc", "jumped")).check(1) + .filter(fuzzy("rc", "jump")).check(1) + .filter(fuzzy("rc", "jumper")).check(1) + .filter(fuzzy("rc", "ajumper")).check(1) + .filter(fuzzy("rc", "gjumperd")).check(1) + .filter(fuzzy("rc", "dogjumperdog")).check(0) + .filter(contains("rc", "jumped")).check(1) + //TODO: check this behaviour + .filter(contains("rc", "jump")).check(0) + .filter(contains("rc", "jumper")).check(0) + .filter(contains("rc", "ajumped")).check(0) + .filter(prefix("rc", "jump")).check(1) + .filter(prefix("rc", "ju")).check(1) + .filter(regexp("rc", "[j][aeiou]{1}.*")).check(1) + .filter(regexp("rc", "[j][aeiou]{2}.*")).check(0) + .filter(wildcard("rc", "*jumpe*")).check(1) + .filter(wildcard("rc", "*ju*pe*")).check(1) + .filter(wildcard("rc", "*jum*pe*")).check(1) + .filter(wildcard("rc", "jumpe*")).check(1) + .filter(wildcard("rc", "**e*")).check(1) + .filter(wildcard("rc", "and/or*")).check(1) + .filter(wildcard("rc", "*and/or")).check(1) + .filter(wildcard("rc", "and*or")).check(1); + } + + @Test + public void testWhiteSpaceTokenizerTokenizer() { + utils = CassandraUtils.builder("tokenizer") + .withPartitionKey("pk") + .withColumn("pk", "int") + .withColumn("rc", "text", textMapper().analyzer("en")) + .withAnalyzer("en", customAnalyzer(new WhitespaceTokenizer())) + .build() + .createKeyspace() + .createTable() + .insert("pk,rc", 1, "The 2 QUICK Brown-Foxes jumped the lazy dog bone. 
and/or") + .createIndex().refresh() + .filter(all()).check(1) + .filter(none()).check(0) + .filter(match("rc", "The")).check(1) + .filter(match("rc", "the")).check(1) + .filter(match("rc", "2")).check(1) + .filter(match("rc", "QUICK")).check(1) + .filter(match("rc", "quick")).check(0) + .filter(match("rc", "Quick")).check(0) + .filter(match("rc", "Brown")).check(0) + .filter(match("rc", "brown")).check(0) + .filter(match("rc", "brOwn")).check(0) + .filter(match("rc", "Foxes")).check(0) + .filter(match("rc", "-Foxes")).check(0) + .filter(match("rc", "Brown-Foxes")).check(1) + .filter(match("rc", "BrownFoxes")).check(0) + .filter(match("rc", "brown-Foxes")).check(0) + .filter(match("rc", "brown-foxes")).check(0) + .filter(match("rc", "jumped")).check(1) + .filter(match("rc", "jump")).check(0) + .filter(match("rc", "dog")).check(1) + .filter(match("rc", "dogs")).check(0) + .filter(match("rc", "and")).check(0) + .filter(match("rc", "or")).check(0) + .filter(phrase("rc", "jumped the")).check(1) + .filter(phrase("rc", "jump the")).check(0) + .filter(phrase("rc", " dog bone. and/or")).check(1) + .filter(fuzzy("rc", "jumped")).check(1) + .filter(fuzzy("rc", "jump")).check(1) + .filter(fuzzy("rc", "jumper")).check(1) + .filter(fuzzy("rc", "ajumper")).check(1) + .filter(fuzzy("rc", "gjumperd")).check(1) + .filter(fuzzy("rc", "dogjumperdog")).check(0) + .filter(contains("rc", "jumped")).check(1) + //TODO: check this behaviour + .filter(contains("rc", "jump")).check(0) + .filter(contains("rc", "jumper")).check(0) + .filter(contains("rc", "ajumped")).check(0) + .filter(prefix("rc", "jump")).check(1) + .filter(prefix("rc", "ju")).check(1) + .filter(regexp("rc", "[j][aeiou]{1}.*")).check(1) + .filter(regexp("rc", "[j][aeiou]{2}.*")).check(0) + .filter(wildcard("rc", "*jumpe*")).check(1) + .filter(wildcard("rc", "*ju*pe*")).check(1) + .filter(wildcard("rc", "*jum*pe*")).check(1) + .filter(wildcard("rc", "jumpe*")).check(1) + .filter(wildcard("rc", "**e*")).check(1) + .filter(wildcard("rc", "and/or*")).check(1) + .filter(wildcard("rc", "*and/or")).check(1) + .filter(wildcard("rc", "and*or")).check(1); + } + + @Test + public void testWikipediaTokenizerTokenizer() { + String test = "[[link]] This is a [[Category:foo]] Category This is a linked [[:Category:bar none withstanding]] " + + "Category This is (parens) This is a [[link]] This is an external URL [http://lucene.apache.org] " + + "Here is ''italics'' and ''more italics'', '''bold''' and '''''five quotes''''' " + + " This is a [[link|display info]] This is a period. Here is $3.25 and here is 3.50. Here's Johnny. " + + "==heading== ===sub head=== followed by some text [[Category:blah| ]] " + + "''[[Category:ital_cat]]'' here is some that is ''italics [[Category:foo]] but is never closed." 
+ + "'''same [[Category:foo]] goes for this '''''and2 [[Category:foo]] and this" + + " [http://foo.boo.com/test/test/ Test Test] [http://foo.boo.com/test/test/test.html Test Test]" + + " [http://foo.boo.com/test/test/test.html?g=b&c=d Test Test] Citation martian code"; + + utils = CassandraUtils.builder("tokenizer") + .withPartitionKey("pk") + .withColumn("pk", "int") + .withColumn("rc", "text", textMapper().analyzer("en")) + .withAnalyzer("en", customAnalyzer(new WikipediaTokenizer())) + .build() + .createKeyspace() + .createTable() + .insert("pk,rc", 1, test) + .createIndex().refresh() + .filter(all()).check(1) + .filter(none()).check(0) + .filter(match("rc", "[http://foo.boo.com/test/test/ Test Test] [http://foo.boo.com/test/test/test.html Test Test]")).check(1) + .filter(match("rc", "http://foo.boo.com/test/test/test.html")).check(0) + .filter(match("rc", "''italics''")).check(1) + .filter(match("rc", "italics")).check(1) + .filter(phrase("rc", "===sub head=== followed by some text")).check(1) + .filter(phrase("rc", "sub head followed by some text")).check(1) + .filter(fuzzy("rc", "sub")).check(1) + .filter(fuzzy("rc", "sub head followed by some text")).check(0) + .filter(contains("rc", "===sub head=== followed by some text")).check(1) + .filter(contains("rc", "sub head followed by some text")).check(1); + } +} + From 33b3011db1da2aa378a88db181972c58a897af57 Mon Sep 17 00:00:00 2001 From: jpgilaberte Date: Tue, 16 May 2017 14:15:41 +0200 Subject: [PATCH 06/40] Add license --- .../analysis/tokenizer/TokenizerBuilderIT.java | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/testsAT/src/test/java/com/stratio/cassandra/lucene/testsAT/schema/analysis/tokenizer/TokenizerBuilderIT.java b/testsAT/src/test/java/com/stratio/cassandra/lucene/testsAT/schema/analysis/tokenizer/TokenizerBuilderIT.java index 8d10474be..d3ce7352f 100644 --- a/testsAT/src/test/java/com/stratio/cassandra/lucene/testsAT/schema/analysis/tokenizer/TokenizerBuilderIT.java +++ b/testsAT/src/test/java/com/stratio/cassandra/lucene/testsAT/schema/analysis/tokenizer/TokenizerBuilderIT.java @@ -1,3 +1,18 @@ +/* + * Copyright (C) 2014 Stratio (http://stratio.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ package com.stratio.cassandra.lucene.testsAT.schema.analysis.tokenizer; import com.stratio.cassandra.lucene.builder.index.schema.analysis.tokenizer.*; From e999d1f99ecba1923706972c2a6a189a0252506e Mon Sep 17 00:00:00 2001 From: jpgilaberte Date: Tue, 16 May 2017 14:26:51 +0200 Subject: [PATCH 07/40] Add license in scala files --- .../lucene/schema/analysis/CustomAnalyzer.java | 15 +++++++++++++++ .../tokenizer/EdgeNGramTokenizerBuilder.scala | 15 +++++++++++++++ .../tokenizer/KeywordTokenizerBuilder.scala | 15 +++++++++++++++ .../tokenizer/LetterTokenizerBuilder.scala | 15 +++++++++++++++ .../tokenizer/LowerCaseTokenizerBuilder.scala | 15 +++++++++++++++ .../tokenizer/NGramTokenizerBuilder.scala | 15 +++++++++++++++ .../tokenizer/PathHierarchyTokenizerBuilder.scala | 15 +++++++++++++++ .../tokenizer/PatternTokenizerBuilder.scala | 15 +++++++++++++++ .../ReversePathHierarchyTokenizerBuilder.scala | 15 +++++++++++++++ .../tokenizer/StandardTokenizerBuilder.scala | 15 +++++++++++++++ .../analysis/tokenizer/ThaiTokenizerBuilder.scala | 15 +++++++++++++++ .../analysis/tokenizer/TokenizerBuilder.scala | 15 +++++++++++++++ .../tokenizer/UAX29URLEmailTokenizerBuilder.scala | 15 +++++++++++++++ .../UnicodeWhitespaceTokenizerBuilder.scala | 15 +++++++++++++++ .../tokenizer/WhitespaceTokenizerBuilder.scala | 15 +++++++++++++++ .../tokenizer/WikipediaTokenizerBuilder.scala | 15 +++++++++++++++ 16 files changed, 240 insertions(+) diff --git a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/CustomAnalyzer.java b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/CustomAnalyzer.java index fb9bc2e3b..344953456 100644 --- a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/CustomAnalyzer.java +++ b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/CustomAnalyzer.java @@ -1,3 +1,18 @@ +/* + * Copyright (C) 2014 Stratio (http://stratio.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package com.stratio.cassandra.lucene.schema.analysis; import org.apache.lucene.analysis.Analyzer; diff --git a/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/EdgeNGramTokenizerBuilder.scala b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/EdgeNGramTokenizerBuilder.scala index d8f08b854..250edc9f4 100644 --- a/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/EdgeNGramTokenizerBuilder.scala +++ b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/EdgeNGramTokenizerBuilder.scala @@ -1,3 +1,18 @@ +/* + * Copyright (C) 2014 Stratio (http://stratio.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package com.stratio.cassandra.lucene.schema.analysis.tokenizer import com.fasterxml.jackson.annotation.JsonProperty diff --git a/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/KeywordTokenizerBuilder.scala b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/KeywordTokenizerBuilder.scala index 28141288a..b787818e1 100644 --- a/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/KeywordTokenizerBuilder.scala +++ b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/KeywordTokenizerBuilder.scala @@ -1,3 +1,18 @@ +/* + * Copyright (C) 2014 Stratio (http://stratio.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package com.stratio.cassandra.lucene.schema.analysis.tokenizer import com.fasterxml.jackson.annotation.JsonProperty diff --git a/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/LetterTokenizerBuilder.scala b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/LetterTokenizerBuilder.scala index fdf7973b6..8318b0df2 100644 --- a/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/LetterTokenizerBuilder.scala +++ b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/LetterTokenizerBuilder.scala @@ -1,3 +1,18 @@ +/* + * Copyright (C) 2014 Stratio (http://stratio.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ package com.stratio.cassandra.lucene.schema.analysis.tokenizer import org.apache.lucene.analysis.core.LetterTokenizer diff --git a/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/LowerCaseTokenizerBuilder.scala b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/LowerCaseTokenizerBuilder.scala index 0bb03f94e..adead865d 100644 --- a/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/LowerCaseTokenizerBuilder.scala +++ b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/LowerCaseTokenizerBuilder.scala @@ -1,3 +1,18 @@ +/* + * Copyright (C) 2014 Stratio (http://stratio.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package com.stratio.cassandra.lucene.schema.analysis.tokenizer diff --git a/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/NGramTokenizerBuilder.scala b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/NGramTokenizerBuilder.scala index 4a60fd80a..bf457c276 100644 --- a/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/NGramTokenizerBuilder.scala +++ b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/NGramTokenizerBuilder.scala @@ -1,3 +1,18 @@ +/* + * Copyright (C) 2014 Stratio (http://stratio.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package com.stratio.cassandra.lucene.schema.analysis.tokenizer import com.fasterxml.jackson.annotation.JsonProperty diff --git a/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/PathHierarchyTokenizerBuilder.scala b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/PathHierarchyTokenizerBuilder.scala index 29876b425..d34501074 100644 --- a/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/PathHierarchyTokenizerBuilder.scala +++ b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/PathHierarchyTokenizerBuilder.scala @@ -1,3 +1,18 @@ +/* + * Copyright (C) 2014 Stratio (http://stratio.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package com.stratio.cassandra.lucene.schema.analysis.tokenizer import com.fasterxml.jackson.annotation.JsonProperty diff --git a/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/PatternTokenizerBuilder.scala b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/PatternTokenizerBuilder.scala index 909d10a38..7b27631e5 100644 --- a/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/PatternTokenizerBuilder.scala +++ b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/PatternTokenizerBuilder.scala @@ -1,3 +1,18 @@ +/* + * Copyright (C) 2014 Stratio (http://stratio.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package com.stratio.cassandra.lucene.schema.analysis.tokenizer import java.util.regex.Pattern diff --git a/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/ReversePathHierarchyTokenizerBuilder.scala b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/ReversePathHierarchyTokenizerBuilder.scala index bccece833..b35e11ad0 100644 --- a/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/ReversePathHierarchyTokenizerBuilder.scala +++ b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/ReversePathHierarchyTokenizerBuilder.scala @@ -1,3 +1,18 @@ +/* + * Copyright (C) 2014 Stratio (http://stratio.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ package com.stratio.cassandra.lucene.schema.analysis.tokenizer import com.fasterxml.jackson.annotation.JsonProperty diff --git a/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/StandardTokenizerBuilder.scala b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/StandardTokenizerBuilder.scala index 9ccb7336f..97abf122e 100644 --- a/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/StandardTokenizerBuilder.scala +++ b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/StandardTokenizerBuilder.scala @@ -1,3 +1,18 @@ +/* + * Copyright (C) 2014 Stratio (http://stratio.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package com.stratio.cassandra.lucene.schema.analysis.tokenizer import com.fasterxml.jackson.annotation.JsonProperty diff --git a/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/ThaiTokenizerBuilder.scala b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/ThaiTokenizerBuilder.scala index 7039e9887..07fc1a7db 100644 --- a/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/ThaiTokenizerBuilder.scala +++ b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/ThaiTokenizerBuilder.scala @@ -1,3 +1,18 @@ +/* + * Copyright (C) 2014 Stratio (http://stratio.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package com.stratio.cassandra.lucene.schema.analysis.tokenizer import org.apache.lucene.analysis.th.ThaiTokenizer diff --git a/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/TokenizerBuilder.scala b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/TokenizerBuilder.scala index 24a811196..082f0d83d 100644 --- a/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/TokenizerBuilder.scala +++ b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/TokenizerBuilder.scala @@ -1,3 +1,18 @@ +/* + * Copyright (C) 2014 Stratio (http://stratio.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package com.stratio.cassandra.lucene.schema.analysis.tokenizer import com.fasterxml.jackson.annotation.JsonSubTypes.Type diff --git a/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/UAX29URLEmailTokenizerBuilder.scala b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/UAX29URLEmailTokenizerBuilder.scala index da8764864..befd4054d 100644 --- a/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/UAX29URLEmailTokenizerBuilder.scala +++ b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/UAX29URLEmailTokenizerBuilder.scala @@ -1,3 +1,18 @@ +/* + * Copyright (C) 2014 Stratio (http://stratio.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package com.stratio.cassandra.lucene.schema.analysis.tokenizer import com.fasterxml.jackson.annotation.JsonProperty diff --git a/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/UnicodeWhitespaceTokenizerBuilder.scala b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/UnicodeWhitespaceTokenizerBuilder.scala index 705e7aece..8c39e3799 100644 --- a/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/UnicodeWhitespaceTokenizerBuilder.scala +++ b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/UnicodeWhitespaceTokenizerBuilder.scala @@ -1,3 +1,18 @@ +/* + * Copyright (C) 2014 Stratio (http://stratio.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ package com.stratio.cassandra.lucene.schema.analysis.tokenizer import org.apache.lucene.analysis.core.UnicodeWhitespaceTokenizer diff --git a/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/WhitespaceTokenizerBuilder.scala b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/WhitespaceTokenizerBuilder.scala index 9e4086678..e12eb3594 100644 --- a/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/WhitespaceTokenizerBuilder.scala +++ b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/WhitespaceTokenizerBuilder.scala @@ -1,3 +1,18 @@ +/* + * Copyright (C) 2014 Stratio (http://stratio.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package com.stratio.cassandra.lucene.schema.analysis.tokenizer import org.apache.lucene.analysis.core.WhitespaceTokenizer diff --git a/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/WikipediaTokenizerBuilder.scala b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/WikipediaTokenizerBuilder.scala index 8e4fb6f9e..4c9d32904 100644 --- a/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/WikipediaTokenizerBuilder.scala +++ b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/WikipediaTokenizerBuilder.scala @@ -1,3 +1,18 @@ +/* + * Copyright (C) 2014 Stratio (http://stratio.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ package com.stratio.cassandra.lucene.schema.analysis.tokenizer import com.fasterxml.jackson.annotation.JsonProperty From 567c0df525a1407299ef1977e40157a83ab99c8c Mon Sep 17 00:00:00 2001 From: jpgilaberte Date: Tue, 16 May 2017 14:28:21 +0200 Subject: [PATCH 08/40] Add license in test files --- .../analysis/tokenizer/TokenizerBuilderTest.scala | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/plugin/src/test/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/TokenizerBuilderTest.scala b/plugin/src/test/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/TokenizerBuilderTest.scala index 3a04cdf62..e3c2d2aff 100644 --- a/plugin/src/test/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/TokenizerBuilderTest.scala +++ b/plugin/src/test/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/TokenizerBuilderTest.scala @@ -1,3 +1,18 @@ +/* + * Copyright (C) 2014 Stratio (http://stratio.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package com.stratio.cassandra.lucene.schema.analysis.tokenizer import com.stratio.cassandra.lucene.BaseScalaTest From 563c76cfbbf4f01f7c89fa81d25d2be60eec771e Mon Sep 17 00:00:00 2001 From: jpgilaberte Date: Tue, 16 May 2017 14:31:43 +0200 Subject: [PATCH 09/40] Add license in custom analyzer --- .../index/schema/analysis/CustomAnalyzer.java | 15 +++++++++++++++ .../schema/analysis/tokenizer/TokenizerTest.java | 15 +++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/CustomAnalyzer.java b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/CustomAnalyzer.java index 8056b3d02..75f2f30ea 100644 --- a/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/CustomAnalyzer.java +++ b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/CustomAnalyzer.java @@ -1,3 +1,18 @@ +/* + * Copyright (C) 2014 Stratio (http://stratio.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ package com.stratio.cassandra.lucene.builder.index.schema.analysis; import com.fasterxml.jackson.annotation.JsonCreator; diff --git a/builder/src/test/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/TokenizerTest.java b/builder/src/test/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/TokenizerTest.java index d3f4f27fd..1f773528f 100644 --- a/builder/src/test/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/TokenizerTest.java +++ b/builder/src/test/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/TokenizerTest.java @@ -1,3 +1,18 @@ +/* + * Copyright (C) 2014 Stratio (http://stratio.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package com.stratio.cassandra.lucene.builder.index.schema.analysis.tokenizer; import com.google.common.collect.Sets; From 049d826183cf31dc16dbab54a0f8422a8ef7d3e9 Mon Sep 17 00:00:00 2001 From: jpgilaberte Date: Fri, 2 Jun 2017 12:11:57 +0200 Subject: [PATCH 10/40] Refactor tokenizers --- .../tokenizer/ClassicTokenizerBuilder.scala | 44 --------------- .../tokenizer/EdgeNGramTokenizerBuilder.scala | 44 --------------- .../tokenizer/KeywordTokenizerBuilder.scala | 39 ------------- .../tokenizer/LetterTokenizerBuilder.scala | 34 ----------- .../tokenizer/LowerCaseTokenizerBuilder.scala | 33 ----------- .../tokenizer/NGramTokenizerBuilder.scala | 43 -------------- .../PathHierarchyTokenizerBuilder.scala | 49 ---------------- .../tokenizer/PatternTokenizerBuilder.scala | 48 ---------------- ...ReversePathHierarchyTokenizerBuilder.scala | 49 ---------------- .../tokenizer/StandardTokenizerBuilder.scala | 41 -------------- .../tokenizer/ThaiTokenizerBuilder.scala | 32 ----------- .../analysis/tokenizer/TokenizerBuilder.scala | 56 +++++++------------ .../UAX29URLEmailTokenizerBuilder.scala | 41 -------------- .../UnicodeWhitespaceTokenizerBuilder.scala | 32 ----------- .../WhitespaceTokenizerBuilder.scala | 32 ----------- .../tokenizer/WikipediaTokenizerBuilder.scala | 52 ----------------- 16 files changed, 19 insertions(+), 650 deletions(-) delete mode 100644 plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/ClassicTokenizerBuilder.scala delete mode 100644 plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/EdgeNGramTokenizerBuilder.scala delete mode 100644 plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/KeywordTokenizerBuilder.scala delete mode 100644 plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/LetterTokenizerBuilder.scala delete mode 100644 plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/LowerCaseTokenizerBuilder.scala delete mode 100644 plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/NGramTokenizerBuilder.scala delete mode 100644 plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/PathHierarchyTokenizerBuilder.scala delete mode 100644 
plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/PatternTokenizerBuilder.scala delete mode 100644 plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/ReversePathHierarchyTokenizerBuilder.scala delete mode 100644 plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/StandardTokenizerBuilder.scala delete mode 100644 plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/ThaiTokenizerBuilder.scala delete mode 100644 plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/UAX29URLEmailTokenizerBuilder.scala delete mode 100644 plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/UnicodeWhitespaceTokenizerBuilder.scala delete mode 100644 plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/WhitespaceTokenizerBuilder.scala delete mode 100644 plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/WikipediaTokenizerBuilder.scala diff --git a/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/ClassicTokenizerBuilder.scala b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/ClassicTokenizerBuilder.scala deleted file mode 100644 index 73f3ce26a..000000000 --- a/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/ClassicTokenizerBuilder.scala +++ /dev/null @@ -1,44 +0,0 @@ -/* - * Copyright (C) 2014 Stratio (http://stratio.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package com.stratio.cassandra.lucene.schema.analysis.tokenizer - -import com.fasterxml.jackson.annotation.JsonProperty -import org.apache.lucene.analysis.standard.ClassicTokenizer - -/** - * A {@link TokenizerBuilder} for building {@link org.apache.lucene.analysis.standard.ClassicTokenizer} - * - * @author Juan Pedro Gilaberte jpgilaberte@stratio.com - * @param maxTokenLength if a token length is bigger that this, token is split at max token length intervals. - */ -case class ClassicTokenizerBuilder(@JsonProperty("max_token_length") maxTokenLength: Integer) extends TokenizerBuilder[ClassicTokenizer] { - - /** - * Gets or creates the Lucene {@link Tokenizer}. 
- * - * @return the built analyzer - */ - override val function = () => { - val tokenizer = new ClassicTokenizer() - tokenizer.setMaxTokenLength(getOrDefault(Option(maxTokenLength), - ClassicTokenizerBuilder.DEFAULT_MAX_TOKEN_LENGTH).asInstanceOf[Integer]) - tokenizer - } -} - -object ClassicTokenizerBuilder { - final val DEFAULT_MAX_TOKEN_LENGTH = 250 -} \ No newline at end of file diff --git a/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/EdgeNGramTokenizerBuilder.scala b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/EdgeNGramTokenizerBuilder.scala deleted file mode 100644 index 250edc9f4..000000000 --- a/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/EdgeNGramTokenizerBuilder.scala +++ /dev/null @@ -1,44 +0,0 @@ -/* - * Copyright (C) 2014 Stratio (http://stratio.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package com.stratio.cassandra.lucene.schema.analysis.tokenizer - -import com.fasterxml.jackson.annotation.JsonProperty -import org.apache.lucene.analysis.ngram.EdgeNGramTokenizer - -/** - * A {@link EdgeNGramTokenizer} for building {@link org.apache.lucene.analysis.ngram.EdgeNGramTokenizer} - * - * @author Juan Pedro Gilaberte jpgilaberte@stratio.com - * @param minGram the smallest n-gram to generate - * @param maxGram the largest n-gram to generate - */ -case class EdgeNGramTokenizerBuilder(@JsonProperty("min_gram") final val minGram: Integer, - @JsonProperty("max_gram") final val maxGram: Integer) extends TokenizerBuilder[EdgeNGramTokenizer] { - /** - * Builds a new {@link EdgeNGramTokenizer} using the specified minGram and manGram. - * - */ - override def function = () => { - new EdgeNGramTokenizer(getOrDefault(Option(minGram), EdgeNGramTokenizerBuilder.DEFAULT_MIN_GRAM ).asInstanceOf[Integer], - getOrDefault(Option(maxGram), EdgeNGramTokenizerBuilder.DEFAULT_MAX_GRAM ).asInstanceOf[Integer]) - } - -} - -object EdgeNGramTokenizerBuilder { - final val DEFAULT_MIN_GRAM = 1 - final val DEFAULT_MAX_GRAM = 1 -} \ No newline at end of file diff --git a/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/KeywordTokenizerBuilder.scala b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/KeywordTokenizerBuilder.scala deleted file mode 100644 index b787818e1..000000000 --- a/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/KeywordTokenizerBuilder.scala +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Copyright (C) 2014 Stratio (http://stratio.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ -package com.stratio.cassandra.lucene.schema.analysis.tokenizer - -import com.fasterxml.jackson.annotation.JsonProperty -import org.apache.lucene.analysis.core.KeywordTokenizer - -/** - * A {@link KeywordTokenizer} for building {@link org.apache.lucene.analysis.core.KeywordTokenizer} - * - * @author Juan Pedro Gilaberte jpgilaberte@stratio.com - * @param bufferSize the terms cache read buffer size - */ -case class KeywordTokenizerBuilder(@JsonProperty("buffer_size") final val bufferSize: Integer = KeywordTokenizerBuilder.DEFAULT_BUFFER_SIZE) extends TokenizerBuilder[KeywordTokenizer] { - /** - * Builds a new {@link KeywordTokenizerBuilder} using the specified buffer_size. - * - */ - override def function: () => KeywordTokenizer = () => new KeywordTokenizer(getOrDefault(Option(bufferSize), KeywordTokenizerBuilder.DEFAULT_BUFFER_SIZE ).asInstanceOf[Integer]) -} - - -object KeywordTokenizerBuilder { - final val DEFAULT_BUFFER_SIZE = 256 -} - diff --git a/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/LetterTokenizerBuilder.scala b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/LetterTokenizerBuilder.scala deleted file mode 100644 index 8318b0df2..000000000 --- a/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/LetterTokenizerBuilder.scala +++ /dev/null @@ -1,34 +0,0 @@ -/* - * Copyright (C) 2014 Stratio (http://stratio.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package com.stratio.cassandra.lucene.schema.analysis.tokenizer - -import org.apache.lucene.analysis.core.LetterTokenizer - -/** - * A {@link LetterTokenizer} for building {@link org.apache.lucene.analysis.core.LetterTokenizer} - * - * @author Juan Pedro Gilaberte jpgilaberte@stratio.com - */ -case class LetterTokenizerBuilder() extends TokenizerBuilder[LetterTokenizer] { - /** - * Builds a new {@link LetterTokenizer}. - */ - override def function: () => LetterTokenizer = () => new LetterTokenizer -} - -object LetterTokenizerBuilder {} - - diff --git a/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/LowerCaseTokenizerBuilder.scala b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/LowerCaseTokenizerBuilder.scala deleted file mode 100644 index adead865d..000000000 --- a/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/LowerCaseTokenizerBuilder.scala +++ /dev/null @@ -1,33 +0,0 @@ -/* - * Copyright (C) 2014 Stratio (http://stratio.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package com.stratio.cassandra.lucene.schema.analysis.tokenizer - - -import org.apache.lucene.analysis.core.LowerCaseTokenizer - -/** - * A {@link LowerCaseTokenizer} for building {@link org.apache.lucene.analysis.core.LowerCaseTokenizer} - * - * @author Juan Pedro Gilaberte jpgilaberte@stratio.com - */ -case class LowerCaseTokenizerBuilder() extends TokenizerBuilder[LowerCaseTokenizer]{ - /** - * Builds a new {@link LowerCaseTokenizer}. - */ - override def function: () => LowerCaseTokenizer = () => new LowerCaseTokenizer -} - -object LowerCaseTokenizerBuilder {} \ No newline at end of file diff --git a/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/NGramTokenizerBuilder.scala b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/NGramTokenizerBuilder.scala deleted file mode 100644 index bf457c276..000000000 --- a/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/NGramTokenizerBuilder.scala +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Copyright (C) 2014 Stratio (http://stratio.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package com.stratio.cassandra.lucene.schema.analysis.tokenizer - -import com.fasterxml.jackson.annotation.JsonProperty -import org.apache.lucene.analysis.ngram.NGramTokenizer - -/** - * A {@link TokenizerBuilder} for building {@link org.apache.lucene.analysis.ngram.NGramTokenizer} - * - * @author Juan Pedro Gilaberte jpgilaberte@stratio.com - * @param minGram the smallest n-gram to generate - * @param maxGram the largest n-gram to generate - */ -case class NGramTokenizerBuilder(@JsonProperty("min_gram") final val minGram: Integer, @JsonProperty("max_gram") final val maxGram: Integer) extends TokenizerBuilder[NGramTokenizer] { - - /** - * Builds a new {@link NGramTokenizer} using the specified minGram and manGram. 
- * - */ - override def function = () => new NGramTokenizer( - getOrDefault(Option(minGram), NGramTokenizerBuilder.DEFAULT_MIN_GRAM).asInstanceOf[Integer], - getOrDefault(Option(maxGram), NGramTokenizerBuilder.DEFAULT_MAX_GRAM).asInstanceOf[Integer]) - -} - -object NGramTokenizerBuilder { - final val DEFAULT_MIN_GRAM = 1 - final val DEFAULT_MAX_GRAM = 2 -} diff --git a/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/PathHierarchyTokenizerBuilder.scala b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/PathHierarchyTokenizerBuilder.scala deleted file mode 100644 index d34501074..000000000 --- a/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/PathHierarchyTokenizerBuilder.scala +++ /dev/null @@ -1,49 +0,0 @@ -/* - * Copyright (C) 2014 Stratio (http://stratio.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package com.stratio.cassandra.lucene.schema.analysis.tokenizer - -import com.fasterxml.jackson.annotation.JsonProperty -import org.apache.lucene.analysis.path.PathHierarchyTokenizer - -/** - * A {@link TokenizerBuilder} for building {@link org.apache.lucene.analysis.path.PathHierarchyTokenizer} - * - * @author Juan Pedro Gilaberte jpgilaberte@stratio.com - * @param bufferSize terms cache read buffer size - * @param delimiter path separator - * @param replacement a replacement character for delimiter - * @param skip number of initial tokens to skip - */ -case class PathHierarchyTokenizerBuilder(@JsonProperty("buffer_size") final val bufferSize: Integer, - @JsonProperty("delimiter") final val delimiter: Character, - @JsonProperty("replacement") final val replacement: Character, - @JsonProperty("skip") final val skip: Integer) extends TokenizerBuilder[PathHierarchyTokenizer] { - /** - * Builds a new {@link PathHierarchyTokenizer} using the specified bufferSize, delimiter, replacement and - * skip. 
- */
- override def function = () => new PathHierarchyTokenizer(getOrDefault(Option(bufferSize), PathHierarchyTokenizerBuilder.DEFAULT_BUFFER_SIZE).asInstanceOf[Integer],
- getOrDefault(Option(delimiter), PathHierarchyTokenizerBuilder.DEFAULT_DELIMITER).asInstanceOf[Char],
- getOrDefault(Option(replacement), PathHierarchyTokenizerBuilder.DEFAULT_REPLACEMENT).asInstanceOf[Char],
- getOrDefault(Option(skip), PathHierarchyTokenizerBuilder.DEFAULT_SKIP).asInstanceOf[Integer])
-}
-
-object PathHierarchyTokenizerBuilder {
- final val DEFAULT_BUFFER_SIZE = 1024
- final val DEFAULT_DELIMITER = '/'
- final val DEFAULT_REPLACEMENT = '/'
- final val DEFAULT_SKIP = 0
-}
\ No newline at end of file
diff --git a/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/PatternTokenizerBuilder.scala b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/PatternTokenizerBuilder.scala
deleted file mode 100644
index 7b27631e5..000000000
--- a/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/PatternTokenizerBuilder.scala
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * Copyright (C) 2014 Stratio (http://stratio.com)
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package com.stratio.cassandra.lucene.schema.analysis.tokenizer
-
-import java.util.regex.Pattern
-
-import com.fasterxml.jackson.annotation.JsonProperty
-import org.apache.lucene.analysis.pattern.PatternTokenizer
-
-/**
- * A {@link TokenizerBuilder} for building {@link org.apache.lucene.analysis.pattern.PatternTokenizer}
- *
- * @author Juan Pedro Gilaberte jpgilaberte@stratio.com
- * @param pattern the regular expression used to tokenize the input
- * @param flags the regular expression compilation flags
- * @param group the capturing group whose matches are emitted as tokens; -1 splits the input on pattern matches
- */
-case class PatternTokenizerBuilder(@JsonProperty("pattern") final val pattern: String, @JsonProperty(
- "flags") final val flags: Integer, @JsonProperty("group") final val group: Integer) extends TokenizerBuilder[PatternTokenizer] {
- /**
- * Builds a new {@link PatternTokenizer} using the specified pattern, flags and group. 
- *
- */
- override def function = () => new PatternTokenizer(Pattern.compile(
- getOrDefault(Option(pattern), PatternTokenizerBuilder.DEFAULT_PATTERN).asInstanceOf[String],
- getOrDefault(Option(flags), PatternTokenizerBuilder.DEFAULT_FLAGS).asInstanceOf[Integer]),
- getOrDefault(Option(group), PatternTokenizerBuilder.DEFAULT_GROUP).asInstanceOf[Integer])
-}
-
-
-object PatternTokenizerBuilder {
- final val DEFAULT_PATTERN = "\\W+"
- final val DEFAULT_FLAGS = 0
- final val DEFAULT_GROUP = -1
-}
\ No newline at end of file
diff --git a/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/ReversePathHierarchyTokenizerBuilder.scala b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/ReversePathHierarchyTokenizerBuilder.scala
deleted file mode 100644
index b35e11ad0..000000000
--- a/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/ReversePathHierarchyTokenizerBuilder.scala
+++ /dev/null
@@ -1,49 +0,0 @@
-/*
- * Copyright (C) 2014 Stratio (http://stratio.com)
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package com.stratio.cassandra.lucene.schema.analysis.tokenizer
-
-import com.fasterxml.jackson.annotation.JsonProperty
-import org.apache.lucene.analysis.path.ReversePathHierarchyTokenizer
-
-/**
- * A {@link TokenizerBuilder} for building {@link org.apache.lucene.analysis.path.ReversePathHierarchyTokenizer}
- *
- * @author Juan Pedro Gilaberte jpgilaberte@stratio.com
- * @param bufferSize terms cache read buffer size
- * @param delimiter path separator
- * @param replacement a replacement character for delimiter
- * @param skip number of initial tokens to skip
- */
-case class ReversePathHierarchyTokenizerBuilder(@JsonProperty("buffer_size") final val bufferSize: Integer, @JsonProperty(
- "delimiter") final val delimiter: Character, @JsonProperty("replacement") final val replacement: Character, @JsonProperty(
- "skip") final val skip: Integer) extends TokenizerBuilder[ReversePathHierarchyTokenizer] {
-
- /**
- * Builds a new {@link ReversePathHierarchyTokenizer} using the specified bufferSize, delimiter, replacement and skip values. 
- *
- */
- override def function = () => new ReversePathHierarchyTokenizer(getOrDefault(Option(bufferSize), ReversePathHierarchyTokenizerBuilder.DEFAULT_BUFFER_SIZE).asInstanceOf[Int],
- getOrDefault(Option(delimiter), ReversePathHierarchyTokenizerBuilder.DEFAULT_DELIMITER).asInstanceOf[Char],
- getOrDefault(Option(replacement), ReversePathHierarchyTokenizerBuilder.DEFAULT_REPLACEMENT).asInstanceOf[Char],
- getOrDefault(Option(skip), ReversePathHierarchyTokenizerBuilder.DEFAULT_SKIP).asInstanceOf[Int])
-}
-
-object ReversePathHierarchyTokenizerBuilder {
- final val DEFAULT_BUFFER_SIZE = 1024
- final val DEFAULT_DELIMITER = '/'
- final val DEFAULT_REPLACEMENT = '/'
- final val DEFAULT_SKIP = 0
-}
\ No newline at end of file
diff --git a/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/StandardTokenizerBuilder.scala b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/StandardTokenizerBuilder.scala
deleted file mode 100644
index 97abf122e..000000000
--- a/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/StandardTokenizerBuilder.scala
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
- * Copyright (C) 2014 Stratio (http://stratio.com)
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package com.stratio.cassandra.lucene.schema.analysis.tokenizer
-
-import com.fasterxml.jackson.annotation.JsonProperty
-import org.apache.lucene.analysis.standard.StandardTokenizer
-
-/**
- * A {@link TokenizerBuilder} for building {@link org.apache.lucene.analysis.standard.StandardTokenizer}
- *
- * @author Juan Pedro Gilaberte jpgilaberte@stratio.com
- * @param maxTokenLength if a token length is bigger than this, the token is split at max token length intervals.
- */
-case class StandardTokenizerBuilder(@JsonProperty("max_token_length") final val maxTokenLength: Integer) extends TokenizerBuilder[StandardTokenizer] {
- /**
- * Builds a new {@link StandardTokenizer} using the specified maxTokenLength.
- *
- */
- override def function = () => {
- val tokenizer: StandardTokenizer = new StandardTokenizer
- tokenizer.setMaxTokenLength(getOrDefault(Option(maxTokenLength), StandardTokenizerBuilder.DEFAULT_MAX_TOKEN_LENGTH).asInstanceOf[Int])
- tokenizer
- }
-}
-
-object StandardTokenizerBuilder {
- final val DEFAULT_MAX_TOKEN_LENGTH = 255
-}
\ No newline at end of file
diff --git a/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/ThaiTokenizerBuilder.scala b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/ThaiTokenizerBuilder.scala
deleted file mode 100644
index 07fc1a7db..000000000
--- a/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/ThaiTokenizerBuilder.scala
+++ /dev/null
@@ -1,32 +0,0 @@
-/*
- * Copyright (C) 2014 Stratio (http://stratio.com)
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package com.stratio.cassandra.lucene.schema.analysis.tokenizer - -import org.apache.lucene.analysis.th.ThaiTokenizer - -/** - * A {@link ThaiTokenizer} for building {@link org.apache.lucene.analysis.th.ThaiTokenizer} - * - * @author Juan Pedro Gilaberte jpgilaberte@stratio.com - */ -case class ThaiTokenizerBuilder() extends TokenizerBuilder[ThaiTokenizer] { - /** - * Builds a new {@link ThaiTokenizer}. - */ - override def function = () => new ThaiTokenizer -} - -object ThaiTokenizerBuilder {} \ No newline at end of file diff --git a/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/TokenizerBuilder.scala b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/TokenizerBuilder.scala index 082f0d83d..f652458ac 100644 --- a/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/TokenizerBuilder.scala +++ b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/TokenizerBuilder.scala @@ -16,8 +16,10 @@ package com.stratio.cassandra.lucene.schema.analysis.tokenizer import com.fasterxml.jackson.annotation.JsonSubTypes.Type -import com.fasterxml.jackson.annotation.{JsonSubTypes, JsonTypeInfo} -import org.apache.lucene.analysis.Tokenizer +import com.fasterxml.jackson.annotation.{JsonProperty, JsonSubTypes, JsonTypeInfo} +import com.stratio.cassandra.lucene.schema.analysis.Builder +import org.apache.lucene.analysis.util.TokenizerFactory + /** * @author Juan Pedro Gilaberte jpgilaberte@stratio.com @@ -31,45 +33,25 @@ import org.apache.lucene.analysis.Tokenizer new Type(value = classOf[NGramTokenizerBuilder], name = "ngram"), new Type(value = classOf[PathHierarchyTokenizerBuilder], name = "path_hierarchy"), new Type(value = classOf[PatternTokenizerBuilder], name = "pattern"), - new Type(value = classOf[ReversePathHierarchyTokenizerBuilder], name = "reverse_path_hierarchy"), new Type(value = classOf[StandardTokenizerBuilder], name = "standard"), new Type(value = classOf[UAX29URLEmailTokenizerBuilder], name = "uax29_url_email"), - new Type(value = classOf[UnicodeWhitespaceTokenizerBuilder], name = "unicode_whitespace"), new Type(value = classOf[ThaiTokenizerBuilder], name = "thai"), new Type(value = classOf[WhitespaceTokenizerBuilder], name = "whitespace"), new Type(value = classOf[WikipediaTokenizerBuilder], name = "wikipedia")) -) trait TokenizerBuilder[T <: Tokenizer] { - /** - * - * @return - */ - def function : ()=>T - - //TODO: refactor scala style (remove throw) - /** - * - * @param throwable - * @return - */ - def failThrowException(throwable: Throwable) = throw throwable - - /** - * Gets or creates the Lucene {@link Tokenizer}. - * - * @return the built analyzer - */ - def buildTokenizer: T = { - import scala.util.control.Exception._ - //TODO: refactor scala style (manage either in other level) - catching(classOf[Exception]).either(function()).asInstanceOf[Either[Exception, T]].fold(failThrowException, x=>x) - } - - /** - * @param param the main parameter. - * @param defaultParam the default parameter if main paramaeter is null. 
- * @return if (param!=null) { return param; }else{ return defaultParam; } - */ - def getOrDefault(param: Option[Any], defaultParam: Any): Any = param.map(x => x).getOrElse(defaultParam) +) sealed abstract class TokenizerBuilder[T](typeBuilder: String) extends Builder[T]{ + def buildFunction = () => TokenizerFactory.forName(typeBuilder, mapParsed).asInstanceOf[T] } -object TokenizerBuilder{} \ No newline at end of file +final case class ClassicTokenizerBuilder(@JsonProperty("max_token_length") maxTokenLength: Integer) extends TokenizerBuilder[TokenizerFactory]("classic") +final case class EdgeNGramTokenizerBuilder(@JsonProperty("min_gram_size") minGramSize: Integer, @JsonProperty("max_gram_size") maxGramSize: Integer) extends TokenizerBuilder[TokenizerFactory]("edgengram") +final case class KeywordTokenizerBuilder() extends TokenizerBuilder[TokenizerFactory]("keyword") +final case class LetterTokenizerBuilder() extends TokenizerBuilder[TokenizerFactory]("letter") +final case class LowerCaseTokenizerBuilder() extends TokenizerBuilder[TokenizerFactory]("lowercase") +final case class NGramTokenizerBuilder(@JsonProperty("min_gram_size") minGramSize: Integer, @JsonProperty("max_gram_size") maxGramSize: Integer) extends TokenizerBuilder[TokenizerFactory]("ngram") +final case class PathHierarchyTokenizerBuilder(@JsonProperty("reverse") reverse: Boolean, @JsonProperty("delimiter") delimiter: Char, @JsonProperty("replace") replace: Char, @JsonProperty("skip") skip: Integer) extends TokenizerBuilder[TokenizerFactory]("pathhierarchy") +final case class PatternTokenizerBuilder(@JsonProperty("pattern") pattern: String, @JsonProperty("group") group: Integer) extends TokenizerBuilder[TokenizerFactory]("pattern") +final case class StandardTokenizerBuilder(@JsonProperty("max_token_length") maxTokenLength: Integer) extends TokenizerBuilder[TokenizerFactory]("standard") +final case class ThaiTokenizerBuilder() extends TokenizerBuilder[TokenizerFactory]("thai") +final case class UAX29URLEmailTokenizerBuilder(@JsonProperty("max_token_length") maxTokenLength: Integer) extends TokenizerBuilder[TokenizerFactory]("uax29urlemail") +final case class WhitespaceTokenizerBuilder(@JsonProperty("rule") rule: String) extends TokenizerBuilder[TokenizerFactory]("whitespace") +final case class WikipediaTokenizerBuilder() extends TokenizerBuilder[TokenizerFactory]("wikipedia") diff --git a/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/UAX29URLEmailTokenizerBuilder.scala b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/UAX29URLEmailTokenizerBuilder.scala deleted file mode 100644 index befd4054d..000000000 --- a/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/UAX29URLEmailTokenizerBuilder.scala +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright (C) 2014 Stratio (http://stratio.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package com.stratio.cassandra.lucene.schema.analysis.tokenizer - -import com.fasterxml.jackson.annotation.JsonProperty -import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer - -/** - * A {@link TokenizerBuilder} for building {@link org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer} - * - * @author Eduardo Alonso eduardoalonso@stratio.com - * @param maxTokenLength if a token length is bigger that this, token is split at max token length intervals. - */ -case class UAX29URLEmailTokenizerBuilder(@JsonProperty("max_token_length") final val maxTokenLength: Integer) extends TokenizerBuilder[UAX29URLEmailTokenizer] { - /** - * Builds a new {@link UAX29URLEmailTokenizer} using the specified maxTokenLength. - * - */ - override def function = () => { - val tokenizer: UAX29URLEmailTokenizer = new UAX29URLEmailTokenizer - tokenizer.setMaxTokenLength(getOrDefault(Option(maxTokenLength), UAX29URLEmailTokenizerBuilder.DEFAULT_MAX_TOKEN_LENGHT).asInstanceOf[Int]) - tokenizer - } -} - -object UAX29URLEmailTokenizerBuilder { - final val DEFAULT_MAX_TOKEN_LENGHT = 255 -} \ No newline at end of file diff --git a/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/UnicodeWhitespaceTokenizerBuilder.scala b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/UnicodeWhitespaceTokenizerBuilder.scala deleted file mode 100644 index 8c39e3799..000000000 --- a/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/UnicodeWhitespaceTokenizerBuilder.scala +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright (C) 2014 Stratio (http://stratio.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package com.stratio.cassandra.lucene.schema.analysis.tokenizer - -import org.apache.lucene.analysis.core.UnicodeWhitespaceTokenizer - -/** - * A {@link TokenizerBuilder} for building {@link org.apache.lucene.analysis.core.UnicodeWhitespaceTokenizer} - * - * @author Eduardo Alonso eduardoalonso@stratio.com - */ -case class UnicodeWhitespaceTokenizerBuilder() extends TokenizerBuilder[UnicodeWhitespaceTokenizer] { - /** - * Builds a new {@link UnicodeWhitespaceTokenizer}. - */ - override def function = () => new UnicodeWhitespaceTokenizer -} - -object UnicodeWhitespaceTokenizerBuilder {} \ No newline at end of file diff --git a/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/WhitespaceTokenizerBuilder.scala b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/WhitespaceTokenizerBuilder.scala deleted file mode 100644 index e12eb3594..000000000 --- a/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/WhitespaceTokenizerBuilder.scala +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright (C) 2014 Stratio (http://stratio.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package com.stratio.cassandra.lucene.schema.analysis.tokenizer
-
-import org.apache.lucene.analysis.core.WhitespaceTokenizer
-
-/**
- * A {@link TokenizerBuilder} for building {@link org.apache.lucene.analysis.core.WhitespaceTokenizer}
- *
- * @author Eduardo Alonso eduardoalonso@stratio.com
- */
-case class WhitespaceTokenizerBuilder() extends TokenizerBuilder[WhitespaceTokenizer]{
- /**
- * Builds a new {@link WhitespaceTokenizer}.
- */
- override def function = () => new WhitespaceTokenizer
-}
-
-object WhitespaceTokenizerBuilder {}
\ No newline at end of file
diff --git a/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/WikipediaTokenizerBuilder.scala b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/WikipediaTokenizerBuilder.scala
deleted file mode 100644
index 4c9d32904..000000000
--- a/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/WikipediaTokenizerBuilder.scala
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * Copyright (C) 2014 Stratio (http://stratio.com)
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package com.stratio.cassandra.lucene.schema.analysis.tokenizer
-
-import com.fasterxml.jackson.annotation.JsonProperty
-import com.google.common.collect.Sets
-import org.apache.lucene.analysis.wikipedia.WikipediaTokenizer
-
-import scala.collection.immutable.ListSet
-
-/**
- * A {@link TokenizerBuilder} for building {@link org.apache.lucene.analysis.wikipedia.WikipediaTokenizer}
- *
- * @author Eduardo Alonso eduardoalonso@stratio.com
- * @param tokenOutput the tokenizer output mode: untokenized only, tokens only, or both
- * @param untokenizedTypes //TODO
- */
-case class WikipediaTokenizerBuilder(@JsonProperty("token_output") tokenOutput: String,
- @JsonProperty("untokenized_types") final val untokenizedTypes: Array[String]) extends TokenizerBuilder[WikipediaTokenizer]{
- /**
- * Builds a new {@link WikipediaTokenizer} using the specified tokenOutput and untokenizedTypes. 
- * - */ - override def function = () => { - import collection.JavaConverters._ - new WikipediaTokenizer(getOrDefault(Option(TokenOutputEnum.withName(tokenOutput).id), TokenOutputEnum.TOKENS_ONLY).asInstanceOf[Int], untokenizedTypes.toSet.asJava) - } -} - -object WikipediaTokenizer { - final val DEFAULT_TOKEN_OUTPUT = TokenOutputEnum.TOKENS_ONLY -} - -object TokenOutputEnum extends Enumeration{ - type TokenOutputValue = Value - val TOKENS_ONLY = Value("TOKENS_ONLY") - val UNTOKENIZED_ONLY = Value("UNTOKENIZED_ONLY") - val BOTH = Value("BOTH") -} From 048a1dfea90541590c6cf0127de3db2d0cb03d1f Mon Sep 17 00:00:00 2001 From: jpgilaberte Date: Fri, 2 Jun 2017 12:12:40 +0200 Subject: [PATCH 11/40] Add charFilters --- .../charFilter/CharFilterBuilder.scala | 54 +++++++++++++++++++ 1 file changed, 54 insertions(+) create mode 100644 plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/charFilter/CharFilterBuilder.scala diff --git a/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/charFilter/CharFilterBuilder.scala b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/charFilter/CharFilterBuilder.scala new file mode 100644 index 000000000..4e6f3e9e8 --- /dev/null +++ b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/charFilter/CharFilterBuilder.scala @@ -0,0 +1,54 @@ +/* + * Copyright (C) 2014 Stratio (http://stratio.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.stratio.cassandra.lucene.schema.analysis.charFilter + +import com.fasterxml.jackson.annotation.JsonSubTypes.Type +import com.fasterxml.jackson.annotation.{JsonProperty, JsonSubTypes, JsonTypeInfo} +import com.stratio.cassandra.lucene.schema.analysis.Builder +import org.apache.lucene.analysis.util.CharFilterFactory + + +/** + * Base utility class for implementing a {@link BaseCharFilter}. 
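+ *
+ * As an illustrative sketch (not asserted by this patch), a char filter using the
+ * type names registered in the companion object below would appear in the index
+ * schema JSON as follows, assuming the "char_filter" array key consumed by the
+ * custom analyzer:
+ *
+ * {{{
+ * "char_filter": [
+ *   {"type": "htmlstrip"},
+ *   {"type": "patternreplace", "pattern": "\\s+", "replacement": " "}
+ * ]
+ * }}}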
+ * + * @author Juan Pedro Gilaberte jpgilaberte@stratio.com + */ +@JsonTypeInfo(use = JsonTypeInfo.Id.NAME, include = JsonTypeInfo.As.PROPERTY, property = CharFilterBuilder.TYPE) +@JsonSubTypes(Array(new Type(value = classOf[HtmlStripCharFilterBuilder], name = CharFilterBuilder.HTML_STRIP), + new Type(value = classOf[MappingCharFilterBuilder], name = CharFilterBuilder.MAPPING), + new Type(value = classOf[PersianCharFilterBuilder], name = CharFilterBuilder.PERSIAN), + new Type(value = classOf[PatternReplaceCharFilterBuilder], name = CharFilterBuilder.PATTERN_REPLACE))) +sealed abstract class CharFilterBuilder[T](typeBuilder: String) extends Builder[T]{ + def buildFunction = () => CharFilterFactory.forName(typeBuilder, mapParsed).asInstanceOf[T] +} + +final case class HtmlStripCharFilterBuilder() extends CharFilterBuilder[CharFilterFactory](CharFilterBuilder.HTML_STRIP) +final case class PersianCharFilterBuilder() extends CharFilterBuilder[CharFilterFactory](CharFilterBuilder.PERSIAN) +final case class PatternReplaceCharFilterBuilder(@JsonProperty(CharFilterBuilder.PATTERN) pattern: String, @JsonProperty(CharFilterBuilder.REPLACEMENT) replacement:String) extends CharFilterBuilder[CharFilterFactory](CharFilterBuilder.PATTERN_REPLACE) +final case class MappingCharFilterBuilder(@JsonProperty(CharFilterBuilder.MAPPINGS) mapping: String) extends CharFilterBuilder[CharFilterFactory](CharFilterBuilder.MAPPING){ + +} + +object CharFilterBuilder{ + final val MAPPINGS = "mapping" + final val TYPE = "type" + final val PATTERN = "pattern" + final val HTML_STRIP = "htmlstrip" + final val MAPPING = "mapping" + final val PERSIAN = "persian" + final val PATTERN_REPLACE = "patternreplace" + final val REPLACEMENT = "replacement" +} From 1645cf0fcc3babcff4d0170b6ab313f5d88a3539 Mon Sep 17 00:00:00 2001 From: jpgilaberte Date: Fri, 2 Jun 2017 12:12:57 +0200 Subject: [PATCH 12/40] Add tokenFilter --- .../tokenFilter/TokenFilterBuilder.scala | 158 ++++++++++++++++++ 1 file changed, 158 insertions(+) create mode 100644 plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenFilter/TokenFilterBuilder.scala diff --git a/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenFilter/TokenFilterBuilder.scala b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenFilter/TokenFilterBuilder.scala new file mode 100644 index 000000000..edab7c71c --- /dev/null +++ b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenFilter/TokenFilterBuilder.scala @@ -0,0 +1,158 @@ +/* + * Copyright (C) 2014 Stratio (http://stratio.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+package com.stratio.cassandra.lucene.schema.analysis.tokenFilter
+
+import com.fasterxml.jackson.annotation.JsonSubTypes.Type
+import com.fasterxml.jackson.annotation.{JsonProperty, JsonSubTypes, JsonTypeInfo}
+import com.stratio.cassandra.lucene.schema.analysis.Builder
+import org.apache.lucene.analysis.util.TokenFilterFactory
+
+
+/**
+ * Base class for building Lucene {@link TokenFilterFactory}-based token filters.
+ *
+ * @author Juan Pedro Gilaberte jpgilaberte@stratio.com
+ */
+@JsonTypeInfo(use = JsonTypeInfo.Id.NAME, include = JsonTypeInfo.As.PROPERTY, property = "type")
+@JsonSubTypes(Array(new Type(value = classOf[StandardTokenFilterBuilder], name = "standard"),
+  new Type(value = classOf[ApostropheTokenFilterBuilder], name = "apostrophe"),
+  new Type(value = classOf[ArabicNormalizationTokenFilterBuilder], name = "arabicnormalization"),
+  new Type(value = classOf[ArabicStemTokenFilterBuilder], name = "arabicstem"),
+  new Type(value = classOf[SoraniNormalizationTokenFilterBuilder], name = "soraninormalization"),
+  new Type(value = classOf[IndicNormalizationTokenFilterBuilder], name = "indicnormalization"),
+  new Type(value = classOf[PortugueseStemTokenFilterBuilder], name = "portuguesestem"),
+  new Type(value = classOf[GermanMinimalStemTokenFilterBuilder], name = "germanminimalstem"),
+  new Type(value = classOf[UpperCaseTokenFilterBuilder], name = "uppercase"),
+  new Type(value = classOf[KeywordRepeatTokenFilterBuilder], name = "keywordrepeat"),
+  new Type(value = classOf[ClassicTokenFilterBuilder], name = "classic"),
+  new Type(value = classOf[ShingleTokenFilterBuilder], name = "shingle"),
+  new Type(value = classOf[StemmeroverrideTokenFilterBuilder], name = "stemmeroverride"),
+  new Type(value = classOf[BulgarianstemTokenFilterBuilder], name = "bulgarianstem"),
+  new Type(value = classOf[SwedishlightstemTokenFilterBuilder], name = "swedishlightstem"),
+  new Type(value = classOf[FrenchlightstemTokenFilterBuilder], name = "frenchlightstem"),
+  new Type(value = classOf[CjkwidthTokenFilterBuilder], name = "cjkwidth"),
+  new Type(value = classOf[GreekstemTokenFilterBuilder], name = "greekstem"),
+  new Type(value = classOf[StopTokenFilterBuilder], name = "stop"),
+  new Type(value = classOf[HindistemTokenFilterBuilder], name = "hindistem"),
+  new Type(value = classOf[FingerprintTokenFilterBuilder], name = "fingerprint"),
+  new Type(value = classOf[SpanishlightstemTokenFilterBuilder], name = "spanishlightstem"),
+  new Type(value = classOf[HungarianlightstemTokenFilterBuilder], name = "hungarianlightstem"),
+  new Type(value = classOf[NorwegianminimalstemTokenFilterBuilder], name = "norwegianminimalstem"),
+  new Type(value = classOf[PersiannormalizationTokenFilterBuilder], name = "persiannormalization"),
+  new Type(value = classOf[GermanlightstemTokenFilterBuilder], name = "germanlightstem"),
+  new Type(value = classOf[TypeTokenFilterBuilder], name = "type"),
+  new Type(value = classOf[AsciifoldingTokenFilter], name = "asciifolding"),
+  new Type(value = classOf[LowercaseTokenFilter], name = "lowercase")
+))
+sealed abstract class TokenFilterBuilder[T](typeBuilder: String) extends Builder[T]{
+  def buildFunction = () => TokenFilterFactory.forName(typeBuilder, mapParsed).asInstanceOf[T]
+}
+
+final case class StandardTokenFilterBuilder() extends TokenFilterBuilder[TokenFilterFactory]("standard")
+final case class ApostropheTokenFilterBuilder() extends TokenFilterBuilder[TokenFilterFactory]("apostrophe")
+final case class ArabicNormalizationTokenFilterBuilder() extends
TokenFilterBuilder[TokenFilterFactory]("arabicnormalization") +final case class ArabicStemTokenFilterBuilder() extends TokenFilterBuilder[TokenFilterFactory]("arabicstem") +final case class SoraniNormalizationTokenFilterBuilder() extends TokenFilterBuilder[TokenFilterFactory]("soraninormalization") +final case class IndicNormalizationTokenFilterBuilder() extends TokenFilterBuilder[TokenFilterFactory]("indicnormalization") +final case class PortugueseStemTokenFilterBuilder() extends TokenFilterBuilder[TokenFilterFactory]("portuguesestem") +final case class GermanMinimalStemTokenFilterBuilder() extends TokenFilterBuilder[TokenFilterFactory]("germanminimalstem") +final case class UpperCaseTokenFilterBuilder() extends TokenFilterBuilder[TokenFilterFactory]("uppercase") +final case class KeywordRepeatTokenFilterBuilder() extends TokenFilterBuilder[TokenFilterFactory]("keywordrepeat") +final case class ClassicTokenFilterBuilder() extends TokenFilterBuilder[TokenFilterFactory]("classic") +final case class ShingleTokenFilterBuilder() extends TokenFilterBuilder[TokenFilterFactory]("shingle") +final case class StemmeroverrideTokenFilterBuilder() extends TokenFilterBuilder[TokenFilterFactory]("stemmeroverride") +final case class BulgarianstemTokenFilterBuilder() extends TokenFilterBuilder[TokenFilterFactory]("bulgarianstem") +final case class SwedishlightstemTokenFilterBuilder() extends TokenFilterBuilder[TokenFilterFactory]("swedishlightstem") +final case class FrenchlightstemTokenFilterBuilder() extends TokenFilterBuilder[TokenFilterFactory]("frenchlightstem") +final case class CjkwidthTokenFilterBuilder() extends TokenFilterBuilder[TokenFilterFactory]("cjkwidth") +final case class GreekstemTokenFilterBuilder() extends TokenFilterBuilder[TokenFilterFactory]("greekstem") +final case class StopTokenFilterBuilder() extends TokenFilterBuilder[TokenFilterFactory]("stop") +final case class HindistemTokenFilterBuilder() extends TokenFilterBuilder[TokenFilterFactory]("hindistem") +final case class FingerprintTokenFilterBuilder() extends TokenFilterBuilder[TokenFilterFactory]("fingerprint") +final case class SpanishlightstemTokenFilterBuilder() extends TokenFilterBuilder[TokenFilterFactory]("spanishlightstem") +final case class HungarianlightstemTokenFilterBuilder() extends TokenFilterBuilder[TokenFilterFactory]("hungarianlightstem") +final case class NorwegianminimalstemTokenFilterBuilder() extends TokenFilterBuilder[TokenFilterFactory]("norwegianminimalstem") +final case class PersiannormalizationTokenFilterBuilder() extends TokenFilterBuilder[TokenFilterFactory]("persiannormalization") +final case class GermanlightstemTokenFilterBuilder() extends TokenFilterBuilder[TokenFilterFactory]("germanlightstem") +final case class TypeTokenFilterBuilder() extends TokenFilterBuilder[TokenFilterFactory]("type") +// +final case class AsciifoldingTokenFilter(@JsonProperty("preserveOriginal") preserveOriginal:Boolean) extends TokenFilterBuilder[TokenFilterFactory]("asciifolding") +final case class LowercaseTokenFilter() extends TokenFilterBuilder[TokenFilterFactory]("lowercase") +// +//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("germanstem") +//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("ngram") +//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("limittokenposition") +//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("greeklowercase") +//final 
case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("standard") +//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("limittokenoffset") +//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("snowballporter") +//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("typeaspayload") +//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("patternreplace") +//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("cjkbigram") +//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("keywordmarker") +//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("soranistem") +//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("elision") +//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("hunspellstem") +//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("codepointcount") +//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("czechstem") +//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("turkishlowercase") +//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("daterecognizer") +//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("portugueselightstem") +//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("irishlowercase") +//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("commongramsquery") +//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("numericpayload") +//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("scandinavianfolding") +//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("germannormalization") +//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("delimitedpayload") +//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("worddelimiter") +//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("portugueseminimalstem") +//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("removeduplicates") +//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("edgengram") +//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("latvianstem") +//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("porterstem") +//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("finnishlightstem") +//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("commongrams") +//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("galicianstem") +//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("kstem") +//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("reversestring") +//final case class 
StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("asciifolding") +//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("norwegianlightstem") +//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("trim") +//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("length") +//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("decimaldigit") +//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("brazilianstem") +//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("capitalization") +//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("serbiannormalization") +//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("frenchminimalstem") +//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("englishminimalstem") +//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("limittokencount") +//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("hyphenatedwords") +//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("truncate") +//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("tokenoffsetpayload") +//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("galicianminimalstem") +//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("russianlightstem") +//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("englishpossessive") +//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("lowercase") +//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("hindinormalization") +//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("scandinaviannormalization") +//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("thaiword") +//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("synonym") +//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("indonesianstem") +//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("keepword") +//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("hyphenationcompoundword") +//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("dictionarycompoundword") +//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("italianlightstem") +//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("patterncapturegroup") From 5ec11fdb54e136ca2019fad84d9db260d3ca21c2 Mon Sep 17 00:00:00 2001 From: jpgilaberte Date: Fri, 2 Jun 2017 12:14:02 +0200 Subject: [PATCH 13/40] Add builder objects --- .../cassandra/lucene/builder/Builder.java | 22 ++- .../index/schema/analysis/CustomAnalyzer.java | 16 +- .../analysis/charFilter/CharFilter.java | 32 ++++ .../charFilter/HtmlStripCharFilter.java | 26 +++ .../MappingCharFilter.java} | 20 +- 
.../charFilter/PatternCharFilter.java | 37 ++++ .../charFilter/PersianCharFilter.java | 26 +++ .../tokenFilter/ApostropheTokenFilter.java | 27 +++ .../tokenFilter/AsciifoldingTokenFilter.java | 38 ++++ .../tokenFilter/LowercaseTokenFilter.java | 26 +++ .../analysis/tokenFilter/TokenFilter.java | 31 ++++ .../tokenizer/EdgeNGramTokenizer.java | 24 +-- .../analysis/tokenizer/KeywordTokenizer.java | 16 -- .../analysis/tokenizer/NGramTokenizer.java | 28 +-- .../tokenizer/PathHierarchyTokenizer.java | 26 +-- .../analysis/tokenizer/PatternTokenizer.java | 7 - .../ReversePathHierarchyTokenizer.java | 82 --------- .../schema/analysis/tokenizer/Tokenizer.java | 2 - .../tokenizer/WhitespaceTokenizer.java | 16 +- .../tokenizer/WikipediaTokenizer.java | 144 ++++++++------- .../tokenizer/CustomAnalyzertest.java | 76 ++++++++ .../analysis/tokenizer/TokenizerTest.java | 171 +++++------------- 22 files changed, 538 insertions(+), 355 deletions(-) create mode 100644 builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/charFilter/CharFilter.java create mode 100644 builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/charFilter/HtmlStripCharFilter.java rename builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/{tokenizer/UnicodeWhitespaceTokenizer.java => charFilter/MappingCharFilter.java} (66%) create mode 100644 builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/charFilter/PatternCharFilter.java create mode 100644 builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/charFilter/PersianCharFilter.java create mode 100644 builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenFilter/ApostropheTokenFilter.java create mode 100644 builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenFilter/AsciifoldingTokenFilter.java create mode 100644 builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenFilter/LowercaseTokenFilter.java create mode 100644 builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenFilter/TokenFilter.java delete mode 100644 builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/ReversePathHierarchyTokenizer.java create mode 100644 builder/src/test/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/CustomAnalyzertest.java diff --git a/builder/src/main/java/com/stratio/cassandra/lucene/builder/Builder.java b/builder/src/main/java/com/stratio/cassandra/lucene/builder/Builder.java index 7ff7134df..fe0f905ca 100644 --- a/builder/src/main/java/com/stratio/cassandra/lucene/builder/Builder.java +++ b/builder/src/main/java/com/stratio/cassandra/lucene/builder/Builder.java @@ -23,6 +23,8 @@ import com.stratio.cassandra.lucene.builder.index.schema.analysis.ClasspathAnalyzer; import com.stratio.cassandra.lucene.builder.index.schema.analysis.CustomAnalyzer; import com.stratio.cassandra.lucene.builder.index.schema.analysis.SnowballAnalyzer; +import com.stratio.cassandra.lucene.builder.index.schema.analysis.charFilter.CharFilter; +import com.stratio.cassandra.lucene.builder.index.schema.analysis.tokenFilter.TokenFilter; import com.stratio.cassandra.lucene.builder.index.schema.analysis.tokenizer.Tokenizer; import com.stratio.cassandra.lucene.builder.index.schema.mapping.*; import com.stratio.cassandra.lucene.builder.search.Search; @@ -256,13 +258,27 @@ public static SnowballAnalyzer 
snowballAnalyzer(String language) {
     }
 
     /**
-     * Returns a new {@link SnowballAnalyzer} for the specified language and stopwords.
+     * Returns a new {@link CustomAnalyzer} for the specified tokenizer, char filters and token filters.
      *
      * @param tokenizer the tokenizer to use
-     * @return a new custom analyzer
+     * @param charFilter the char filters to apply before the tokenizer
+     * @param tokenFilter the token filters to apply after the tokenizer
+     * @return a new custom analyzer
      */
+    public static CustomAnalyzer customAnalyzer(Tokenizer tokenizer, CharFilter[] charFilter, TokenFilter[] tokenFilter) {
+        return new CustomAnalyzer(tokenizer, charFilter, tokenFilter);
+    }
+
     public static CustomAnalyzer customAnalyzer(Tokenizer tokenizer) {
-        return new CustomAnalyzer(tokenizer);
+        return new CustomAnalyzer(tokenizer, null, null);
+    }
+
+    public static CustomAnalyzer customAnalyzer(Tokenizer tokenizer, CharFilter[] charFilter) {
+        return new CustomAnalyzer(tokenizer, charFilter, null);
+    }
+
+    public static CustomAnalyzer customAnalyzer(Tokenizer tokenizer, TokenFilter[] tokenFilter) {
+        return new CustomAnalyzer(tokenizer, null, tokenFilter);
     }
 
     /**
diff --git a/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/CustomAnalyzer.java b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/CustomAnalyzer.java
index 75f2f30ea..55840a988 100644
--- a/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/CustomAnalyzer.java
+++ b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/CustomAnalyzer.java
@@ -17,8 +17,11 @@
 
 import com.fasterxml.jackson.annotation.JsonCreator;
 import com.fasterxml.jackson.annotation.JsonProperty;
+import com.stratio.cassandra.lucene.builder.index.schema.analysis.charFilter.CharFilter;
+import com.stratio.cassandra.lucene.builder.index.schema.analysis.tokenFilter.TokenFilter;
 import com.stratio.cassandra.lucene.builder.index.schema.analysis.tokenizer.Tokenizer;
+
 /**
  * {@link Analyzer} built from a custom Lucene analysis chain.
  *
@@ -28,16 +31,25 @@
  */
 public class CustomAnalyzer extends Analyzer{
 
+    @JsonProperty("token_filter")
+    private final TokenFilter[] tokenFilter;
+
+    @JsonProperty("char_filter")
+    private final CharFilter[] charFilter;
+
     @JsonProperty("tokenizer")
     private final Tokenizer tokenizer;
 
-
     /**
      * Builds a new {@link CustomAnalyzer} using custom tokenizer, char_filters and token_filters.
      *
-     * @param tokenizer an {@link Tokenizer} the tookenizer to use.
+     * @param tokenizer a {@link Tokenizer}, the tokenizer to use
+     * @param charFilter the char filters to apply
+     * @param tokenFilter the token filters to apply
      */
     @JsonCreator
-    public CustomAnalyzer(@JsonProperty("tokenizer") Tokenizer tokenizer) {
+    public CustomAnalyzer(@JsonProperty("tokenizer") Tokenizer tokenizer,
+                          @JsonProperty("char_filter") CharFilter[] charFilter,
+                          @JsonProperty("token_filter") TokenFilter[] tokenFilter) {
         this.tokenizer = tokenizer;
+        this.charFilter = charFilter;
+        this.tokenFilter = tokenFilter;
     }
 }
diff --git a/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/charFilter/CharFilter.java b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/charFilter/CharFilter.java
new file mode 100644
index 000000000..65f852f0a
--- /dev/null
+++ b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/charFilter/CharFilter.java
@@ -0,0 +1,32 @@
+/*
+ * Copyright (C) 2014 Stratio (http://stratio.com)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.stratio.cassandra.lucene.builder.index.schema.analysis.charFilter; + +import com.fasterxml.jackson.annotation.JsonSubTypes; +import com.fasterxml.jackson.annotation.JsonTypeInfo; +import com.stratio.cassandra.lucene.builder.JSONBuilder; + +/** + * Created by jpgilaberte on 25/05/17. + */ +@JsonTypeInfo(use = JsonTypeInfo.Id.NAME, include = JsonTypeInfo.As.PROPERTY, property = "type") +@JsonSubTypes({@JsonSubTypes.Type(value = MappingCharFilter.class, name = "mapping"), + @JsonSubTypes.Type(value = HtmlStripCharFilter.class, name = "htmlstrip"), + @JsonSubTypes.Type(value = PatternCharFilter.class, name = "pattern"), + @JsonSubTypes.Type(value = PersianCharFilter.class, name = "persian")}) +public class CharFilter extends JSONBuilder{ + +} diff --git a/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/charFilter/HtmlStripCharFilter.java b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/charFilter/HtmlStripCharFilter.java new file mode 100644 index 000000000..154c111cd --- /dev/null +++ b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/charFilter/HtmlStripCharFilter.java @@ -0,0 +1,26 @@ +/* + * Copyright (C) 2014 Stratio (http://stratio.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.stratio.cassandra.lucene.builder.index.schema.analysis.charFilter; + +import com.fasterxml.jackson.annotation.JsonCreator; + +/** + * Created by jpgilaberte on 30/05/17. + */ +public class HtmlStripCharFilter extends CharFilter{ + @JsonCreator + public HtmlStripCharFilter(){} +} diff --git a/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/UnicodeWhitespaceTokenizer.java b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/charFilter/MappingCharFilter.java similarity index 66% rename from builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/UnicodeWhitespaceTokenizer.java rename to builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/charFilter/MappingCharFilter.java index 1f5c36a2f..ed9be8dae 100644 --- a/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/UnicodeWhitespaceTokenizer.java +++ b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/charFilter/MappingCharFilter.java @@ -13,23 +13,21 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package com.stratio.cassandra.lucene.builder.index.schema.analysis.tokenizer; +package com.stratio.cassandra.lucene.builder.index.schema.analysis.charFilter; import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; /** - * {@link Tokenizer} using a Lucene's {@code Tokenizer}s in classpath. - * - * It's uses the {@code Tokenizer}'s default (no args) constructor. - * - * @author Juan Pedro Gilaberte {@literal } + * Created by jpgilaberte on 25/05/17. */ -public class UnicodeWhitespaceTokenizer extends Tokenizer { +public class MappingCharFilter extends CharFilter{ + + @JsonProperty("mapping") + private final String mapping; - /** - * Builds a new {@link LetterTokenizer}. - */ @JsonCreator - public UnicodeWhitespaceTokenizer() { + public MappingCharFilter( @JsonProperty("mapping") String mapping){ + this.mapping = mapping; } } diff --git a/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/charFilter/PatternCharFilter.java b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/charFilter/PatternCharFilter.java new file mode 100644 index 000000000..f3fb4a34a --- /dev/null +++ b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/charFilter/PatternCharFilter.java @@ -0,0 +1,37 @@ +/* + * Copyright (C) 2014 Stratio (http://stratio.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.stratio.cassandra.lucene.builder.index.schema.analysis.charFilter; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; + +/** + * Created by jpgilaberte on 30/05/17. + */ +public class PatternCharFilter extends CharFilter{ + + @JsonProperty("pattern") + final String pattern; + + @JsonProperty("replacement") + final String replacement; + + @JsonCreator + public PatternCharFilter(@JsonProperty("pattern") String pattern, @JsonProperty("replacement") String replacement){ + this.pattern = pattern; + this.replacement = replacement; + } +} diff --git a/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/charFilter/PersianCharFilter.java b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/charFilter/PersianCharFilter.java new file mode 100644 index 000000000..911479679 --- /dev/null +++ b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/charFilter/PersianCharFilter.java @@ -0,0 +1,26 @@ +/* + * Copyright (C) 2014 Stratio (http://stratio.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.stratio.cassandra.lucene.builder.index.schema.analysis.charFilter; + +import com.fasterxml.jackson.annotation.JsonCreator; + +/** + * Created by jpgilaberte on 30/05/17. + */ +public class PersianCharFilter extends CharFilter{ + @JsonCreator + public PersianCharFilter(){} +} diff --git a/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenFilter/ApostropheTokenFilter.java b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenFilter/ApostropheTokenFilter.java new file mode 100644 index 000000000..e63738c99 --- /dev/null +++ b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenFilter/ApostropheTokenFilter.java @@ -0,0 +1,27 @@ +/* + * Copyright (C) 2014 Stratio (http://stratio.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.stratio.cassandra.lucene.builder.index.schema.analysis.tokenFilter; + +import com.fasterxml.jackson.annotation.JsonCreator; + +/** + * Created by jpgilaberte on 25/05/17. + */ +public class ApostropheTokenFilter extends TokenFilter{ + + @JsonCreator + public ApostropheTokenFilter(){} +} diff --git a/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenFilter/AsciifoldingTokenFilter.java b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenFilter/AsciifoldingTokenFilter.java new file mode 100644 index 000000000..a55b43a0a --- /dev/null +++ b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenFilter/AsciifoldingTokenFilter.java @@ -0,0 +1,38 @@ +/* + * Copyright (C) 2014 Stratio (http://stratio.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.stratio.cassandra.lucene.builder.index.schema.analysis.tokenFilter; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; + +/** + * Created by jpgilaberte on 30/05/17. 
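+ *
+ * A sketch of the JSON this maps to, using the "asciifolding" name registered
+ * in {@link TokenFilter} and the "preserveOriginal" property declared below:
+ *
+ * <pre>
+ * {"type": "asciifolding", "preserveOriginal": true}
+ * </pre>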
+ */ +public class AsciifoldingTokenFilter extends TokenFilter{ + + @JsonProperty("preserveOriginal") + private final Boolean preserveOriginal; + + @JsonCreator + public AsciifoldingTokenFilter(){ + this.preserveOriginal = false; + } + + @JsonCreator + public AsciifoldingTokenFilter(@JsonProperty("preserveOriginal") Boolean preserveOriginal){ + this.preserveOriginal = preserveOriginal; + } +} diff --git a/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenFilter/LowercaseTokenFilter.java b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenFilter/LowercaseTokenFilter.java new file mode 100644 index 000000000..924d9b563 --- /dev/null +++ b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenFilter/LowercaseTokenFilter.java @@ -0,0 +1,26 @@ +/* + * Copyright (C) 2014 Stratio (http://stratio.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.stratio.cassandra.lucene.builder.index.schema.analysis.tokenFilter; + +import com.fasterxml.jackson.annotation.JsonCreator; + +/** + * Created by jpgilaberte on 30/05/17. + */ +public class LowercaseTokenFilter extends TokenFilter{ + @JsonCreator + public LowercaseTokenFilter(){} +} diff --git a/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenFilter/TokenFilter.java b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenFilter/TokenFilter.java new file mode 100644 index 000000000..9a657be58 --- /dev/null +++ b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenFilter/TokenFilter.java @@ -0,0 +1,31 @@ +/* + * Copyright (C) 2014 Stratio (http://stratio.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.stratio.cassandra.lucene.builder.index.schema.analysis.tokenFilter; + +import com.fasterxml.jackson.annotation.JsonSubTypes; +import com.fasterxml.jackson.annotation.JsonTypeInfo; +import com.stratio.cassandra.lucene.builder.JSONBuilder; + + +/** + * Created by jpgilaberte on 25/05/17. 
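+ *
+ * Deserialization is dispatched on the "type" property, so a filter chain can
+ * be written in JSON as, for example (a sketch using only the subtypes
+ * registered below):
+ *
+ * <pre>
+ * "token_filter": [{"type": "lowercase"}, {"type": "asciifolding"}]
+ * </pre>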
+ */
+@JsonTypeInfo(use = JsonTypeInfo.Id.NAME, include = JsonTypeInfo.As.PROPERTY, property = "type")
+@JsonSubTypes({@JsonSubTypes.Type(value = ApostropheTokenFilter.class, name = "apostrophe"),
+               @JsonSubTypes.Type(value = AsciifoldingTokenFilter.class, name = "asciifolding"),
+               @JsonSubTypes.Type(value = LowercaseTokenFilter.class, name = "lowercase")})
+public class TokenFilter extends JSONBuilder{
+}
\ No newline at end of file
diff --git a/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/EdgeNGramTokenizer.java b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/EdgeNGramTokenizer.java
index cd85e0bc1..016d4a052 100644
--- a/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/EdgeNGramTokenizer.java
+++ b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/EdgeNGramTokenizer.java
@@ -30,32 +30,32 @@ public class EdgeNGramTokenizer extends Tokenizer {
 
     static final Integer DEFAULT_MAX_GRAM = 2;
 
     /** the smallest n-gram to generate */
-    @JsonProperty("min_gram")
-    final Integer minGram;
+    @JsonProperty("min_gram_size")
+    final Integer minGramSize;
 
     /** the largest n-gram to generate */
-    @JsonProperty("max_gram")
-    final Integer maxGram;
+    @JsonProperty("max_gram_size")
+    final Integer maxGramSize;
 
     /**
-     * Builds a new {@link EdgeNGramTokenizer} using the default minGram and manGram.
+     * Builds a new {@link EdgeNGramTokenizer} using the default minGramSize and maxGramSize.
      */
     @JsonCreator
     public EdgeNGramTokenizer() {
-        this.minGram = DEFAULT_MIN_GRAM;
-        this.maxGram = DEFAULT_MAX_GRAM;
+        this.minGramSize = DEFAULT_MIN_GRAM;
+        this.maxGramSize = DEFAULT_MAX_GRAM;
     }
 
     /**
-     * Builds a new {@link EdgeNGramTokenizer} using the specified minGram and manGram.
+     * Builds a new {@link EdgeNGramTokenizer} using the specified minGramSize and maxGramSize.
      *
      * @param minGram the smallest n-gram to generate
-     * @param minGram the largest n-gram to generate
+     * @param maxGram the largest n-gram to generate
      */
     @JsonCreator
-    public EdgeNGramTokenizer(@JsonProperty("min_gram") Integer minGram,
-                              @JsonProperty("max_gram") Integer maxGram) {
-        this.minGram = getOrDefault(minGram, DEFAULT_MIN_GRAM);
-        this.maxGram = getOrDefault(maxGram, DEFAULT_MAX_GRAM);
+    public EdgeNGramTokenizer(@JsonProperty("min_gram_size") Integer minGram,
+                              @JsonProperty("max_gram_size") Integer maxGram) {
+        this.minGramSize = getOrDefault(minGram, DEFAULT_MIN_GRAM);
+        this.maxGramSize = getOrDefault(maxGram, DEFAULT_MAX_GRAM);
     }
 }
diff --git a/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/KeywordTokenizer.java b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/KeywordTokenizer.java
index dc4e563d8..111d69d10 100644
--- a/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/KeywordTokenizer.java
+++ b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/KeywordTokenizer.java
@@ -27,24 +27,8 @@
  */
 public class KeywordTokenizer extends Tokenizer {
 
-    static final Integer DEFAULT_BUFFER_SIZE = 256;
-
-    /** terms cache read buffer size */
-    @JsonProperty("buffer_size")
-    final Integer bufferSize;
-
     @JsonCreator
     public KeywordTokenizer() {
-        this.bufferSize = DEFAULT_BUFFER_SIZE;
     }
 
-    /**
-     * Builds a new {@link KeywordTokenizer} using the specified buffer_size.
-     *
-     * @param bufferSize the terms cache read buffer size
-     */
-    @JsonCreator
-    public KeywordTokenizer(@JsonProperty("buffer_size") Integer bufferSize) {
-        this.bufferSize = getOrDefault(bufferSize, DEFAULT_BUFFER_SIZE);
-    }
 }
diff --git a/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/NGramTokenizer.java b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/NGramTokenizer.java
index 004e44c09..ebcf11875 100644
--- a/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/NGramTokenizer.java
+++ b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/NGramTokenizer.java
@@ -31,33 +31,33 @@ public class NGramTokenizer extends Tokenizer {
 
     static final Integer DEFAULT_MAX_GRAM = 2;
 
     /** the smallest n-gram to generate */
-    @JsonProperty("min_gram")
-    final Integer minGram;
+    @JsonProperty("min_gram_size")
+    final Integer minGramSize;
 
     /** the largest n-gram to generate */
-    @JsonProperty("max_gram")
-    final Integer maxGram;
+    @JsonProperty("max_gram_size")
+    final Integer maxGramSize;
 
     /**
-     * Builds a new {@link NGramTokenizer} using the default minGram and manGram.
+     * Builds a new {@link NGramTokenizer} using the default minGramSize and maxGramSize.
      *
      */
     @JsonCreator
     public NGramTokenizer() {
-        this.minGram = DEFAULT_MIN_GRAM;
-        this.maxGram = DEFAULT_MAX_GRAM;
+        this.minGramSize = DEFAULT_MIN_GRAM;
+        this.maxGramSize = DEFAULT_MAX_GRAM;
     }
 
     /**
-     * Builds a new {@link NGramTokenizer} using the specified minGram and manGram.
+     * Builds a new {@link NGramTokenizer} using the specified minGramSize and maxGramSize.
      *
-     * @param minGram the smallest n-gram to generate
-     * @param minGram the largest n-gram to generate
+     * @param minGramSize the smallest n-gram to generate
+     * @param maxGramSize the largest n-gram to generate
      */
     @JsonCreator
-    public NGramTokenizer(@JsonProperty("min_gram") Integer minGram,
-                          @JsonProperty("max_gram") Integer maxGram) {
-        this.minGram = getOrDefault(minGram, DEFAULT_MIN_GRAM);
-        this.maxGram = getOrDefault(maxGram, DEFAULT_MAX_GRAM);
+    public NGramTokenizer(@JsonProperty("min_gram_size") Integer minGramSize,
+                          @JsonProperty("max_gram_size") Integer maxGramSize) {
+        this.minGramSize = getOrDefault(minGramSize, DEFAULT_MIN_GRAM);
+        this.maxGramSize = getOrDefault(maxGramSize, DEFAULT_MAX_GRAM);
     }
 }
diff --git a/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/PathHierarchyTokenizer.java b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/PathHierarchyTokenizer.java
index 67c4f26ae..90c45198b 100644
--- a/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/PathHierarchyTokenizer.java
+++ b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/PathHierarchyTokenizer.java
@@ -27,22 +27,22 @@
  */
 public class PathHierarchyTokenizer extends Tokenizer {
 
-    static final Integer DEFAULT_BUFFER_SIZE = 1024;
+    static final Boolean DEFAULT_REVERSE = false;
     static final Character DEFAULT_DELIMITER = '/';
     static final Character DEFAULT_REPLACEMENT = '/';
     static final Integer DEFAULT_SKIP = 0;
 
-    /** terms cache read buffer size */
-    @JsonProperty("buffer_size")
-    final Integer bufferSize;
+    /** if true, tokenize the path in reverse order */
+    @JsonProperty("reverse")
+    final Boolean reverse;
 
     /** path separator */
     @JsonProperty("delimiter")
     final Character delimiter;
 
     /** a replacement character for delimiter */
-    @JsonProperty("replacement")
-    final Character replacement;
+    @JsonProperty("replace")
+    final Character replace;
 
     /** number of initial tokens to skip */
     @JsonProperty("skip")
     final Integer skip;
 
@@ -53,28 +53,28 @@
      */
     @JsonCreator
     public PathHierarchyTokenizer() {
-        this.bufferSize = DEFAULT_BUFFER_SIZE;
+        this.reverse = DEFAULT_REVERSE;
         this.delimiter = DEFAULT_DELIMITER;
-        this.replacement = DEFAULT_REPLACEMENT;
+        this.replace = DEFAULT_REPLACEMENT;
         this.skip = DEFAULT_SKIP;
     }
 
     /**
-     * Builds a new {@link PathHierarchyTokenizer} using the default bufferSize, delimiter, replacement and skip.
+     * Builds a new {@link PathHierarchyTokenizer} using the specified reverse, delimiter, replace and skip.
      *
-     * @param bufferSize terms cache read buffer size
+     * @param reverse if true, tokenize the path in reverse order
      * @param delimiter path separator
-     * @param replacement a replacement character for delimiter
+     * @param replace a replacement character for delimiter
      * @param skip number of initial tokens to skip
      */
     @JsonCreator
-    public PathHierarchyTokenizer(@JsonProperty("buffer_size") Integer bufferSize,
+    public PathHierarchyTokenizer(@JsonProperty("reverse") Boolean reverse,
                                   @JsonProperty("delimiter") Character delimiter,
-                                  @JsonProperty("replacement") Character replacement,
+                                  @JsonProperty("replace") Character replacement,
                                   @JsonProperty("skip") Integer skip) {
-        this.bufferSize = getOrDefault(bufferSize, DEFAULT_BUFFER_SIZE);
+        this.reverse = getOrDefault(reverse, DEFAULT_REVERSE);
         this.delimiter = getOrDefault(delimiter, DEFAULT_DELIMITER);
-        this.replacement = getOrDefault(replacement, DEFAULT_REPLACEMENT);
+        this.replace = getOrDefault(replacement, DEFAULT_REPLACEMENT);
         this.skip = getOrDefault(skip, DEFAULT_SKIP);
     }
 }
diff --git a/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/PatternTokenizer.java b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/PatternTokenizer.java
index d8d1fc7ca..fbec32c8d 100644
--- a/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/PatternTokenizer.java
+++ b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/PatternTokenizer.java
@@ -28,17 +28,12 @@ public class PatternTokenizer extends Tokenizer {
 
     static final String DEFAULT_PATTERN = "\\W+";
-    static final Integer DEFAULT_FLAGS = 0;
     static final Integer DEFAULT_GROUP = -1;
 
     /** java regular expression */
     @JsonProperty("pattern")
     final String pattern;
 
-    /** java regular expression flags */
-    @JsonProperty("flags")
-    final Integer flags;
-
     /** which pattern group to use to generate tokens (-1 for split) */
     @JsonProperty("group")
     final Integer group;
@@ -49,7 +44,6 @@
     @JsonCreator
     public PatternTokenizer() {
         this.pattern = DEFAULT_PATTERN;
-        this.flags = DEFAULT_FLAGS;
         this.group = DEFAULT_GROUP;
     }
 
@@ -65,7 +59,6 @@
     public PatternTokenizer(@JsonProperty("pattern") String pattern,
                             @JsonProperty("flags") Integer flags,
                             @JsonProperty("group") Integer group) {
         this.pattern = getOrDefault(pattern, DEFAULT_PATTERN);
-        this.flags = getOrDefault(flags, DEFAULT_FLAGS);
         this.group = getOrDefault(group, DEFAULT_GROUP);
     }
 }
diff --git a/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/ReversePathHierarchyTokenizer.java b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/ReversePathHierarchyTokenizer.java
deleted file mode 100644
index cfb1fe051..000000000
--- 
a/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/ReversePathHierarchyTokenizer.java +++ /dev/null @@ -1,82 +0,0 @@ -/* - * Copyright (C) 2014 Stratio (http://stratio.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package com.stratio.cassandra.lucene.builder.index.schema.analysis.tokenizer; - -import com.fasterxml.jackson.annotation.JsonCreator; -import com.fasterxml.jackson.annotation.JsonProperty; - -/** - * {@link Tokenizer} using a Lucene's {@code Tokenizer}s in classpath. - * - * It's uses the {@code Tokenizer}'s default (no args) constructor. - * - * @author Juan Pedro Gilaberte {@literal } - */ -public class ReversePathHierarchyTokenizer extends Tokenizer { - - static final Integer DEFAULT_BUFFER_SIZE = 1024; - static final Character DEFAULT_DELIMITER = '/'; - static final Character DEFAULT_REPLACEMENT = '/'; - static final Integer DEFAULT_SKIP = 0; - - /** terms cache read buffer size */ - @JsonProperty("buffer_size") - final Integer bufferSize; - - /** path separator */ - @JsonProperty("delimiter") - final Character delimiter; - - /** a replacement character for delimiter */ - @JsonProperty("replacement") - final Character replacement; - - /** number of initial tokens to skip */ - @JsonProperty("skip") - final Integer skip; - - /** - * Builds a new {@link ReversePathHierarchyTokenizer} using the default bufferSize, delimiter, replacement and - * skip. - */ - @JsonCreator - public ReversePathHierarchyTokenizer() { - this.bufferSize = DEFAULT_BUFFER_SIZE; - this.delimiter = DEFAULT_DELIMITER; - this.replacement = DEFAULT_REPLACEMENT; - this.skip = DEFAULT_SKIP; - } - - /** - * Builds a new {@link ReversePathHierarchyTokenizer} using the specified bufferSize, delimiter, replacement and - * skip. 
- * - * @param bufferSize terms cache read buffer size - * @param delimiter path separator - * @param replacement a replacement character for delimiter - * @param skip number of initial tokens to skip - */ - @JsonCreator - public ReversePathHierarchyTokenizer(@JsonProperty("buffer_size") Integer bufferSize, - @JsonProperty("delimiter") Character delimiter, - @JsonProperty("replacement") Character replacement, - @JsonProperty("skip") Integer skip) { - this.bufferSize = getOrDefault(bufferSize, DEFAULT_BUFFER_SIZE); - this.delimiter = getOrDefault(delimiter, DEFAULT_DELIMITER); - this.replacement = getOrDefault(replacement, DEFAULT_REPLACEMENT); - this.skip = getOrDefault(skip, DEFAULT_SKIP); - } -} diff --git a/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/Tokenizer.java b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/Tokenizer.java index fc4a28534..2a4e9e1c7 100644 --- a/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/Tokenizer.java +++ b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/Tokenizer.java @@ -33,10 +33,8 @@ @JsonSubTypes.Type(value = NGramTokenizer.class, name = "ngram"), @JsonSubTypes.Type(value = PathHierarchyTokenizer.class, name = "path_hierarchy"), @JsonSubTypes.Type(value = PatternTokenizer.class, name = "pattern"), - @JsonSubTypes.Type(value = ReversePathHierarchyTokenizer.class, name = "reverse_path_hierarchy"), @JsonSubTypes.Type(value = StandardTokenizer.class, name = "standard"), @JsonSubTypes.Type(value = UAX29URLEmailTokenizer.class, name = "uax29_url_email"), - @JsonSubTypes.Type(value = UnicodeWhitespaceTokenizer.class, name = "unicode_whitespace"), @JsonSubTypes.Type(value = ThaiTokenizer.class, name = "thai"), @JsonSubTypes.Type(value = WhitespaceTokenizer.class, name = "whitespace"), @JsonSubTypes.Type(value = WikipediaTokenizer.class, name = "wikipedia")}) diff --git a/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/WhitespaceTokenizer.java b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/WhitespaceTokenizer.java index e07cad7ca..35fe57c0d 100644 --- a/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/WhitespaceTokenizer.java +++ b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/WhitespaceTokenizer.java @@ -16,6 +16,7 @@ package com.stratio.cassandra.lucene.builder.index.schema.analysis.tokenizer; import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; /** * {@link Tokenizer} using a Lucene's {@code Tokenizer}s in classpath. 
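+ *
+ * A sketch of the JSON this maps to once the "rule" option below is added,
+ * assuming the rule values accepted by Lucene's whitespace factory ("java" or
+ * "unicode"):
+ *
+ * <pre>
+ * {"type": "whitespace", "rule": "unicode"}
+ * </pre>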
@@ -26,10 +27,23 @@
  */
 public class WhitespaceTokenizer extends Tokenizer {
 
+    static final String DEFAULT_RULE = "java";
+
+    /** the rule used to split tokens: "java" or "unicode" */
+    @JsonProperty("rule")
+    final String rule;
+
     /**
-     * Builds a new {@link ThaiTokenizer}
+     * Builds a new {@link WhitespaceTokenizer}
      */
     @JsonCreator
     public WhitespaceTokenizer() {
+        this.rule = DEFAULT_RULE;
+    }
+
+    @JsonCreator
+    public WhitespaceTokenizer(@JsonProperty("rule") String rule) {
+        this.rule = rule;
     }
+
 }
diff --git a/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/WikipediaTokenizer.java b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/WikipediaTokenizer.java
index 3f28715b1..828462c80 100644
--- a/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/WikipediaTokenizer.java
+++ b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/WikipediaTokenizer.java
@@ -30,76 +30,80 @@
  */
 public class WikipediaTokenizer extends Tokenizer {
 
-    static final TokenOutputValue DEFAULT_TOKEN_OUTPUT = TokenOutputValue.TOKENS_ONLY;
-    static final Set DEFAULT_UNTOKENIZED_TYPES = Collections.emptySet();
-
-    static final int TOKENS_ONLY_VAR = 0;
-    static final int UNTOKENIZED_ONLY_VAR = 1;
-    static final int BOTH_VAR = 2;
-
-    public enum TokenOutputValue {
-
-        TOKENS_ONLY("TOKENS_ONLY", TOKENS_ONLY_VAR),
-        UNTOKENIZED_ONLY("UNTOKENIZED_ONLY", UNTOKENIZED_ONLY_VAR),
-        BOTH("BOTH", BOTH_VAR);
-
-        private int integerValue;
-        private String stringValue;
-
-        TokenOutputValue(String name, int value) {
-            this.stringValue = name;
-            this.integerValue = value;
-        }
-
-        @JsonCreator
-        public static TokenOutputValue create(String value) {
-            if (value == null) {
-                throw new IllegalArgumentException();
-            }
-            for (TokenOutputValue v : values()) {
-                if (v.getStringValue().equals(value)) {
-                    return v;
-                }
-            }
-            throw new IllegalArgumentException();
-        }
-
-        public int getIntegerValue() {
-            return integerValue;
-        }
-
-        public String getStringValue() {
-            return stringValue;
-        }
-    }
-
-    /** this tokenizer output, only untokenized, only tokens or both */
-    @JsonProperty("token_output")
-    final TokenOutputValue tokenOutput;
-    /** //TODO */
-    @JsonProperty("untokenized_types")
-    final Set untokenizedTypes;
-
-    /**
-     * Builds a new {@link WikipediaTokenizer} using the default tokenOutput and untokenizedTypes.
-     *
-     */
     @JsonCreator
-    public WikipediaTokenizer() {
-        this.tokenOutput = DEFAULT_TOKEN_OUTPUT;
-        this.untokenizedTypes = DEFAULT_UNTOKENIZED_TYPES;
-    }
+    public WikipediaTokenizer() {}
 
-    /**
-     * Builds a new {@link WikipediaTokenizer} using the specified tokenOutput and untokenizedTypes.
- * - * @param tokenOutput this tokenizer output, only untokenized, only tokens or both - * @param untokenizedTypes //TODO - */ - @JsonCreator - public WikipediaTokenizer(@JsonProperty("token_output") WikipediaTokenizer.TokenOutputValue tokenOutput, - @JsonProperty("untokenized_types") Set untokenizedTypes) { - this.tokenOutput = getOrDefault(tokenOutput, DEFAULT_TOKEN_OUTPUT); - this.untokenizedTypes = getOrDefault(untokenizedTypes, DEFAULT_UNTOKENIZED_TYPES); - } +// TODO: refactor wikipedia factory with advanced parameters +// static final TokenOutputValue DEFAULT_TOKEN_OUTPUT = TokenOutputValue.TOKENS_ONLY; +// static final Set DEFAULT_UNTOKENIZED_TYPES = Collections.emptySet(); +// +// static final int TOKENS_ONLY_VAR = 0; +// static final int UNTOKENIZED_ONLY_VAR = 1; +// static final int BOTH_VAR = 2; +// +// public enum TokenOutputValue { +// +// TOKENS_ONLY("TOKENS_ONLY", TOKENS_ONLY_VAR), +// UNTOKENIZED_ONLY("UNTOKENIZED_ONLY", UNTOKENIZED_ONLY_VAR), +// BOTH("BOTH", BOTH_VAR); +// +// private int integerValue; +// private String stringValue; +// +// TokenOutputValue(String name, int value) { +// this.stringValue = name; +// this.integerValue = value; +// } +// +// @JsonCreator +// public static TokenOutputValue create(String value) { +// if (value == null) { +// throw new IllegalArgumentException(); +// } +// for (TokenOutputValue v : values()) { +// if (v.getStringValue().equals(value)) { +// return v; +// } +// } +// throw new IllegalArgumentException(); +// } +// +// public int getIntegerValue() { +// return integerValue; +// } +// +// public String getStringValue() { +// return stringValue; +// } +// } +// +// /** this tokenizer output, only untokenized, only tokens or both */ +// @JsonProperty("token_output") +// final TokenOutputValue tokenOutput; +// /** //TODO */ +// @JsonProperty("untokenized_types") +// final Set untokenizedTypes; +// +// /** +// * Builds a new {@link WikipediaTokenizer} using the default tokenOutput and untokenizedTypes. +// * +// */ +// @JsonCreator +// public WikipediaTokenizer() { +// this.tokenOutput = DEFAULT_TOKEN_OUTPUT; +// this.untokenizedTypes = DEFAULT_UNTOKENIZED_TYPES; +// } +// +// /** +// * Builds a new {@link WikipediaTokenizer} using the specified tokenOutput and untokenizedTypes. +// * +// * @param tokenOutput this tokenizer output, only untokenized, only tokens or both +// * @param untokenizedTypes //TODO +// */ +// @JsonCreator +// public WikipediaTokenizer(@JsonProperty("token_output") WikipediaTokenizer.TokenOutputValue tokenOutput, +// @JsonProperty("untokenized_types") Set untokenizedTypes) { +// this.tokenOutput = getOrDefault(tokenOutput, DEFAULT_TOKEN_OUTPUT); +// this.untokenizedTypes = getOrDefault(untokenizedTypes, DEFAULT_UNTOKENIZED_TYPES); +// } } \ No newline at end of file diff --git a/builder/src/test/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/CustomAnalyzertest.java b/builder/src/test/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/CustomAnalyzertest.java new file mode 100644 index 000000000..8476ca232 --- /dev/null +++ b/builder/src/test/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/CustomAnalyzertest.java @@ -0,0 +1,76 @@ +/* + * Copyright (C) 2014 Stratio (http://stratio.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.stratio.cassandra.lucene.builder.index.schema.analysis.tokenizer; + +import com.stratio.cassandra.lucene.builder.index.schema.analysis.tokenFilter.AsciifoldingTokenFilter; +import com.stratio.cassandra.lucene.builder.index.schema.analysis.tokenFilter.LowercaseTokenFilter; +import com.stratio.cassandra.lucene.builder.index.schema.analysis.tokenFilter.TokenFilter; +import org.junit.Test; + +import static com.stratio.cassandra.lucene.builder.Builder.*; +import static com.stratio.cassandra.lucene.builder.Builder.stringMapper; +import static com.stratio.cassandra.lucene.builder.Builder.uuidMapper; +import static org.junit.Assert.assertEquals; + +/** + * Created by jpgilaberte on 2/06/17. + */ +public class CustomAnalyzertest { + + @Test + public void testIndexFull() { + String actual = index("ks", "table", "idx").keyspace("keyspace") + .column("lucene") + .directoryPath("path") + .refreshSeconds(10D) + .maxCachedMb(32) + .maxMergeMb(16) + .ramBufferMb(64) + .indexingThreads(4) + .indexingQueuesSize(100) + .excludedDataCenters("DC1,DC2") + .sparse(true) + .partitioner(partitionerOnToken(8)) + .defaultAnalyzer("my_analyzer") + .analyzer("my_analyzer", customAnalyzer(new WhitespaceTokenizer(), + null, + new TokenFilter[]{new AsciifoldingTokenFilter(), new LowercaseTokenFilter()})) + .analyzer("snow", snowballAnalyzer("tartar").stopwords("a,b,c")) + .mapper("uuid", uuidMapper().validated(true)) + .mapper("string", stringMapper()) + .build(); + String expected = "CREATE CUSTOM INDEX idx ON keyspace.table(lucene) " + + "USING 'com.stratio.cassandra.lucene.Index' " + + "WITH OPTIONS = {" + + "'refresh_seconds':'10.0'," + + "'directory_path':'path'," + + "'ram_buffer_mb':'64'," + + "'max_merge_mb':'16'," + + "'max_cached_mb':'32'," + + "'indexing_threads':'4'," + + "'indexing_queues_size':'100'," + + "'excluded_data_centers':'DC1,DC2'," + + "'partitioner':'{\"type\":\"token\",\"partitions\":8}'," + + "'sparse':'true'," + + "'schema':'{" + + "\"default_analyzer\":\"my_analyzer\",\"analyzers\":{" + + "\"my_analyzer\":{\"type\":\"custom\",\"tokenizer\":{\"type\":\"whitespace\"},\"token_filter\":[{\"type\":\"asciifolding\",\"preserveOriginal\":false},{\"type\":\"lowercase\"}]}," + + "\"snow\":{\"type\":\"snowball\",\"language\":\"tartar\",\"stopwords\":\"a,b,c\"}}," + + "\"fields\":{" + + "\"uuid\":{\"type\":\"uuid\",\"validated\":true},\"string\":{\"type\":\"string\"}}}'}"; + assertEquals("index serialization is wrong", expected, actual); + } +} diff --git a/builder/src/test/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/TokenizerTest.java b/builder/src/test/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/TokenizerTest.java index 1f773528f..6b4b77d7f 100644 --- a/builder/src/test/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/TokenizerTest.java +++ b/builder/src/test/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/TokenizerTest.java @@ -15,14 +15,12 @@ */ package com.stratio.cassandra.lucene.builder.index.schema.analysis.tokenizer; -import 
com.google.common.collect.Sets;
 import com.stratio.cassandra.lucene.common.JsonSerializer;
 import org.junit.Test;
-
 import java.io.IOException;
-
 import static org.junit.Assert.*;
+
 /**
  * @author Juan Pedro Gilaberte {@literal }
  */
@@ -43,21 +41,6 @@ private void assertJsonParseFail(String json) throws IOException {
         JsonSerializer.fromString(json, Tokenizer.class);
     }

-    private void assertJsonParseFail(String json, String message) {
-        try {
-            JsonSerializer.fromString(json, Tokenizer.class);
-        } catch (IOException e) {
-            assertEquals("Expected IOException with message: " +
-                         message +
-                         " but received: " +
-                         e.getMessage() +
-                         " localMess: " +
-                         e.getLocalizedMessage(), message, e.getMessage());
-        }
-        assertFalse("Parsing: " + json + " must generate an IOException with message: " + message + " but does not.",
-                    true);
-    }
-
     private void assertExactValue(String paramName, Object expected, Object received) {
         assertEquals("Expected " +
                      paramName +
@@ -67,13 +50,6 @@ private void assertExactValue(String paramName, Object expected, Object received
                      received.toString(), expected, received);
     }

-    @Test
-    public void testClassicTokenizerValidJSON() {
-        String json = "{type: \"classic\", max_token_length: 250}";
-        ClassicTokenizer builder = assertAndTokenizer(json, ClassicTokenizer.class);
-        assertExactValue("ClassicTokenizer.maxTokenLength", 250, builder.maxTokenLength);
-    }
-
     @Test
     public void testClassicTokenizerDefaultValues() {
         ClassicTokenizer builder = assertAndTokenizer("{type: \"classic\"}", ClassicTokenizer.class);
@@ -89,20 +65,13 @@ public void testClassicTokenizerInvalidParam() throws IOException {

     @Test
     public void testKeywordTokenizerValidJSON() {
-        String json = "{type: \"keyword\", buffer_size: 256}";
+        String json = "{type: \"keyword\"}";
         KeywordTokenizer builder = assertAndTokenizer(json, KeywordTokenizer.class);
-        assertExactValue("KeywordTokenizer.bufferSize", 256, builder.bufferSize);
     }

     @Test
     public void testKeywordTokenizerDefaultValues() {
         KeywordTokenizer builder = assertAndTokenizer("{type: \"keyword\"}", KeywordTokenizer.class);
-        assertExactValue("ClassicTokenizer.maxTokenLength", KeywordTokenizer.DEFAULT_BUFFER_SIZE, builder.bufferSize);
-    }
-
-    @Test(expected = IOException.class)
-    public void testKeywordTokenizerInvalidJSON() throws IOException {
-        assertJsonParseFail("{type: \"keyword\", bufer_size: 256}");
     }

     @Test
@@ -122,18 +91,18 @@ public void testThaiTokenizerValidJSON() {

     @Test
     public void testNGramTokenizerValidJSON() {
-        String json = "{type: \"ngram\", min_gram: 1, max_gram: 2}";
+        String json = "{type: \"ngram\", min_gram_size: 1, max_gram_size: 2}";
         NGramTokenizer builder = assertAndTokenizer(json, NGramTokenizer.class);
-        assertExactValue("NGramTokenizer.min_gram", NGramTokenizer.DEFAULT_MIN_GRAM, builder.minGram);
-        assertExactValue("NGramTokenizer.max_gram", NGramTokenizer.DEFAULT_MAX_GRAM, builder.maxGram);
+        assertExactValue("NGramTokenizer.min_gram_size", 1, builder.minGramSize);
+        assertExactValue("NGramTokenizer.max_gram_size", 2, builder.maxGramSize);
     }

     @Test
     public void testNGramTokenizerDefaultValues() {
         String json = "{type: \"ngram\"}";
         NGramTokenizer builder = assertAndTokenizer(json, NGramTokenizer.class);
-        assertExactValue("NGramTokenizer.min_gram", NGramTokenizer.DEFAULT_MIN_GRAM, builder.minGram);
-        assertExactValue("NGramTokenizer.max_gram", NGramTokenizer.DEFAULT_MAX_GRAM, builder.maxGram);
+        assertExactValue("NGramTokenizer.min_gram_size", NGramTokenizer.DEFAULT_MIN_GRAM,
builder.minGramSize);
+        assertExactValue("NGramTokenizer.max_gram_size", NGramTokenizer.DEFAULT_MAX_GRAM, builder.maxGramSize);
     }

     @Test(expected = IOException.class)
@@ -143,18 +112,18 @@ public void testNGramTokenizerInvalidJSON() throws IOException {

     @Test
     public void testEdgeNGramTokenizerValidJSON() {
-        String json = "{type: \"edge_ngram\", min_gram: 1, max_gram: 2}";
+        String json = "{type: \"edge_ngram\", min_gram_size: 1, max_gram_size: 2}";
         EdgeNGramTokenizer builder = assertAndTokenizer(json, EdgeNGramTokenizer.class);
-        assertExactValue("EdgeNGramTokenizer.min_gram", EdgeNGramTokenizer.DEFAULT_MIN_GRAM, builder.minGram);
-        assertExactValue("EdgeNGramTokenizer.max_gram", EdgeNGramTokenizer.DEFAULT_MAX_GRAM, builder.maxGram);
+        assertExactValue("EdgeNGramTokenizer.min_gram_size", 1, builder.minGramSize);
+        assertExactValue("EdgeNGramTokenizer.max_gram_size", 2, builder.maxGramSize);
     }

     @Test
     public void testEdgeNGramTokenizerDefaultValues() {
         String json = "{type: \"edge_ngram\"}";
         EdgeNGramTokenizer builder = assertAndTokenizer(json, EdgeNGramTokenizer.class);
-        assertExactValue("EdgeNGramTokenizer.min_gram", EdgeNGramTokenizer.DEFAULT_MIN_GRAM, builder.minGram);
-        assertExactValue("EdgeNGramTokenizer.max_gram", EdgeNGramTokenizer.DEFAULT_MAX_GRAM, builder.maxGram);
+        assertExactValue("EdgeNGramTokenizer.min_gram_size", EdgeNGramTokenizer.DEFAULT_MIN_GRAM, builder.minGramSize);
+        assertExactValue("EdgeNGramTokenizer.max_gram_size", EdgeNGramTokenizer.DEFAULT_MAX_GRAM, builder.maxGramSize);
     }

     @Test(expected = IOException.class)
@@ -164,11 +133,11 @@ public void testEdgeNGramTokenizerInvalidJSON() throws IOException {

     @Test
     public void testPathHierarchyTokenizerValidJSON() {
-        String json = "{type: \"path_hierarchy\", buffer_size: 246, delimiter: \"$\", replacement: \"%\", skip: 3}";
+        String json = "{type: \"path_hierarchy\", reverse: false, delimiter: \"$\", replace: \"%\", skip: 3}";
         PathHierarchyTokenizer builder = assertAndTokenizer(json, PathHierarchyTokenizer.class);
-        assertExactValue("PathHierarchyTokenizer.buffer_size", 246, builder.bufferSize);
+        assertExactValue("PathHierarchyTokenizer.reverse", false, builder.reverse);
         assertExactValue("PathHierarchyTokenizer.delimiter", '$', builder.delimiter);
-        assertExactValue("PathHierarchyTokenizer.replacement", '%', builder.replacement);
+        assertExactValue("PathHierarchyTokenizer.replace", '%', builder.replace);
         assertExactValue("PathHierarchyTokenizer.skip", 3, builder.skip);
     }

@@ -176,29 +145,28 @@ public void testPathHierarchyTokenizerDefaultValues() {
         String json = "{type: \"path_hierarchy\"}";
         PathHierarchyTokenizer builder = assertAndTokenizer(json, PathHierarchyTokenizer.class);
-        assertExactValue("PathHierarchyTokenizer.buffer_size",
-                         PathHierarchyTokenizer.DEFAULT_BUFFER_SIZE,
-                         builder.bufferSize);
+        assertExactValue("PathHierarchyTokenizer.reverse",
+                         PathHierarchyTokenizer.REVERSE,
+                         builder.reverse);
         assertExactValue("PathHierarchyTokenizer.delimiter",
                          PathHierarchyTokenizer.DEFAULT_DELIMITER,
                          builder.delimiter);
-        assertExactValue("PathHierarchyTokenizer.replacement",
+        assertExactValue("PathHierarchyTokenizer.replace",
                          PathHierarchyTokenizer.DEFAULT_REPLACEMENT,
-                         builder.replacement);
+                         builder.replace);
         assertExactValue("PathHierarchyTokenizer.skip", PathHierarchyTokenizer.DEFAULT_SKIP, builder.skip);
     }

     @Test(expected = IOException.class)
     public void testPathHierarchyTokenizerInvalidJSON() throws
IOException { - assertJsonParseFail("{type: \"path_hierarchy\", buffer_size: 246, delimter: \"$\", replacement: \"%\", skip: 3}"); + assertJsonParseFail("{type: \"path_hierarchy\", reverse: false, delimter: \"$\", replace: \"%\", skip: 3}"); } @Test public void testPatternTokenizerValidJSON() { - String json = "{type: \"pattern\", pattern: \"[a-z]\", flags: 35, group: 0}"; + String json = "{type: \"pattern\", pattern: \"[a-z]\", group: 0}"; PatternTokenizer builder = assertAndTokenizer(json, PatternTokenizer.class); assertExactValue("PathHierarchyTokenizer.pattern", "[a-z]", builder.pattern); - assertExactValue("PathHierarchyTokenizer.flags", 35, builder.flags); assertExactValue("PathHierarchyTokenizer.group", 0, builder.group); } @@ -208,46 +176,11 @@ public void testPatternTokenizerDefaultValues() { PatternTokenizer builder = assertAndTokenizer(json, PatternTokenizer.class); assertExactValue("PathHierarchyTokenizer.pattern", PatternTokenizer.DEFAULT_PATTERN, builder.pattern); assertExactValue("PathHierarchyTokenizer.group", PatternTokenizer.DEFAULT_GROUP, builder.group); - assertExactValue("PathHierarchyTokenizer.group", PatternTokenizer.DEFAULT_FLAGS, builder.flags); } @Test(expected = IOException.class) public void testPatternTokenizerInvalidJSON() throws IOException { - assertJsonParseFail("{type: \"pattern\", paern: \"[a-z]\", flags: 35, group: 0}"); - } - - @Test - public void testReversePathHierarchyTokenizerValidJSON() { - String - json - = "{type: \"reverse_path_hierarchy\", buffer_size: 246, delimiter: \"/\", replacement: \"%\", skip: 3}"; - ReversePathHierarchyTokenizer builder = assertAndTokenizer(json, ReversePathHierarchyTokenizer.class); - assertExactValue("ReversePathHierarchyTokenizer.buffer_size", 246, builder.bufferSize); - assertExactValue("ReversePathHierarchyTokenizer.delimiter", '/', builder.delimiter); - assertExactValue("ReversePathHierarchyTokenizer.replacement", '%', builder.replacement); - assertExactValue("ReversePathHierarchyTokenizer.skip", 3, builder.skip); - } - - @Test - public void testReversePathHierarchyTokenizerDefaultValues() { - String json = "{type: \"reverse_path_hierarchy\"}"; - ReversePathHierarchyTokenizer builder = assertAndTokenizer(json, ReversePathHierarchyTokenizer.class); - assertExactValue("PathHierarchyTokenizer.buffer_size", - ReversePathHierarchyTokenizer.DEFAULT_BUFFER_SIZE, - builder.bufferSize); - assertExactValue("PathHierarchyTokenizer.delimiter", - ReversePathHierarchyTokenizer.DEFAULT_DELIMITER, - builder.delimiter); - assertExactValue("PathHierarchyTokenizer.replacement", - ReversePathHierarchyTokenizer.DEFAULT_REPLACEMENT, - builder.replacement); - assertExactValue("PathHierarchyTokenizer.skip", ReversePathHierarchyTokenizer.DEFAULT_SKIP, builder.skip); - } - - @Test(expected = IOException.class) - public void testReversePathHierarchyTokenizerInvalidJSON() throws IOException { - assertJsonParseFail( - "{type: \"reverse_path_hierarchy\", buffer_size: 246, delimiter: \"/\", replacent: \"%\", skip: 3}"); + assertJsonParseFail("{type: \"pattern\", paern: \"[a-z]\", group: 0}"); } @Test @@ -291,44 +224,38 @@ public void testUAX29URLEmailTokenizerInvalidJSON() throws IOException { assertJsonParseFail("{type: \"uax29_url_email\", max_token_lgth: 249}"); } - @Test - public void testUnicodeWhitespaceTokenizerValidJSON() { - String json = "{type:\"unicode_whitespace\"}"; - assertAndTokenizer(json, UnicodeWhitespaceTokenizer.class); - } - @Test public void testWhitespaceTokenizerValidJSON() { String json = "{type:\"whitespace\"}"; 
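        // Illustrative only: with the rule parameter added to the builder's WhitespaceTokenizer,
        // a non-default configuration would look like {type: "whitespace", rule: "unicode"}.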
assertAndTokenizer(json, WhitespaceTokenizer.class); } - @Test - public void testWikipediaTokenizerValidJSON() { - String json = "{type: \"wikipedia\", token_output: \"TOKENS_ONLY\", untokenized_types : [\"aaa\",\"bbb\"]}"; - WikipediaTokenizer builder = assertAndTokenizer(json, WikipediaTokenizer.class); - assertExactValue("WikipediaTokenizer.token_output", - WikipediaTokenizer.TokenOutputValue.TOKENS_ONLY, - builder.tokenOutput); - assertExactValue("WikipediaTokenizer.untokenized_types", - Sets.newHashSet("aaa", "bbb"), - builder.untokenizedTypes); - } - - @Test - public void testWikipediaTokenizerDefaultValues() { - String json = "{type: \"wikipedia\"}"; - WikipediaTokenizer builder = assertAndTokenizer(json, WikipediaTokenizer.class); - assertExactValue("WikipediaTokenizer.token_output", - WikipediaTokenizer.TokenOutputValue.TOKENS_ONLY, - builder.tokenOutput); - assertExactValue("WikipediaTokenizer.untokenized_types", Sets.newHashSet(), builder.untokenizedTypes); - } - - @Test(expected = IOException.class) - public void testWikipediaTokenizerInvalidJSON() throws IOException { - assertJsonParseFail("{type: \"wikipedia\", token_output: \"TOKENS_ONLY\", untoknized_types : [\"aaa\",\"bbb\"]}"); - } +// @Test +// public void testWikipediaTokenizerValidJSON() { +// String json = "{type: \"wikipedia\", token_output: \"TOKENS_ONLY\", untokenized_types : [\"aaa\",\"bbb\"]}"; +// WikipediaTokenizer builder = assertAndTokenizer(json, WikipediaTokenizer.class); +// assertExactValue("WikipediaTokenizer.token_output", +// WikipediaTokenizer.TokenOutputValue.TOKENS_ONLY, +// builder.tokenOutput); +// assertExactValue("WikipediaTokenizer.untokenized_types", +// Sets.newHashSet("aaa", "bbb"), +// builder.untokenizedTypes); +// } +// +// @Test +// public void testWikipediaTokenizerDefaultValues() { +// String json = "{type: \"wikipedia\"}"; +// WikipediaTokenizer builder = assertAndTokenizer(json, WikipediaTokenizer.class); +// assertExactValue("WikipediaTokenizer.token_output", +// WikipediaTokenizer.TokenOutputValue.TOKENS_ONLY, +// builder.tokenOutput); +// assertExactValue("WikipediaTokenizer.untokenized_types", Sets.newHashSet(), builder.untokenizedTypes); +// } +// +// @Test(expected = IOException.class) +// public void testWikipediaTokenizerInvalidJSON() throws IOException { +// assertJsonParseFail("{type: \"wikipedia\", token_output: \"TOKENS_ONLY\", untoknized_types : [\"aaa\",\"bbb\"]}"); +// } @Test(expected = IOException.class) public void testInvalidTokenizerType() throws IOException { From 759c2486265497a30655e439b47e2a112e1ae1dc Mon Sep 17 00:00:00 2001 From: jpgilaberte Date: Fri, 2 Jun 2017 12:15:02 +0200 Subject: [PATCH 14/40] Add plugin Test --- .../schema/analysis/AnalyzerBuilder.java | 1 + .../analysis/CustomAnalyzerBuilder.java | 53 --- .../tokenizer/ClassicTokenizerBuilder.java | 16 - .../tokenizer/EdgeNGramTokenizerBuilder.java | 16 - .../tokenizer/KeywordTokenizerBuilder.java | 16 - .../tokenizer/LetterTokenizerBuilder.java | 17 - .../tokenizer/LowerCaseTokenizerBuilder.java | 16 - .../tokenizer/NGramTokenizerBuilder.java | 16 - .../PathHierarchyTokenizerBuilder.java | 16 - .../tokenizer/PatternTokenizerBuilder.java | 17 - .../ReversePathHierarchyTokenizerBuilder.java | 17 - .../tokenizer/StandardTokenizerBuilder.java | 17 - .../tokenizer/ThaiTokenizerBuilder.java | 16 - .../analysis/tokenizer/TokenizerBuilder.java | 16 - .../UAX29URLEmailTokenizerBuilder.java | 18 - .../UnicodeWhitespaceTokenizerBuilder.java | 16 - .../tokenizer/WhitespaceTokenizerBuilder.java | 16 - 
.../tokenizer/WikipediaTokenizerBuilder.java | 16 - .../lucene/schema/analysis/Builder.scala | 83 ++++ .../analyzer/CustomAnalyzerBuilder.scala | 76 ++++ plugin/src/test/resources/MappingCharFilter | 4 + .../lucene/schema/analysis/BuilderTest.scala | 116 ++++++ .../analyzer/CustomAnalyzerTest.scala | 101 +++++ .../charFilter/CharFilterBuilderTest.scala | 78 ++++ .../tokenFilter/TokenFilterBuilderTest.scala | 61 +++ .../tokenizer/TokenizerBuilderTest.scala | 370 +++++++----------- 26 files changed, 667 insertions(+), 538 deletions(-) delete mode 100644 plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/CustomAnalyzerBuilder.java delete mode 100644 plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/ClassicTokenizerBuilder.java delete mode 100644 plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/EdgeNGramTokenizerBuilder.java delete mode 100644 plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/KeywordTokenizerBuilder.java delete mode 100644 plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/LetterTokenizerBuilder.java delete mode 100644 plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/LowerCaseTokenizerBuilder.java delete mode 100644 plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/NGramTokenizerBuilder.java delete mode 100644 plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/PathHierarchyTokenizerBuilder.java delete mode 100644 plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/PatternTokenizerBuilder.java delete mode 100644 plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/ReversePathHierarchyTokenizerBuilder.java delete mode 100644 plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/StandardTokenizerBuilder.java delete mode 100644 plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/ThaiTokenizerBuilder.java delete mode 100644 plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/TokenizerBuilder.java delete mode 100644 plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/UAX29URLEmailTokenizerBuilder.java delete mode 100644 plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/UnicodeWhitespaceTokenizerBuilder.java delete mode 100644 plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/WhitespaceTokenizerBuilder.java delete mode 100644 plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/WikipediaTokenizerBuilder.java create mode 100644 plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/Builder.scala create mode 100644 plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/analyzer/CustomAnalyzerBuilder.scala create mode 100644 plugin/src/test/resources/MappingCharFilter create mode 100644 plugin/src/test/scala/com/stratio/cassandra/lucene/schema/analysis/BuilderTest.scala create mode 100644 plugin/src/test/scala/com/stratio/cassandra/lucene/schema/analysis/analyzer/CustomAnalyzerTest.scala create mode 100644 plugin/src/test/scala/com/stratio/cassandra/lucene/schema/analysis/charFilter/CharFilterBuilderTest.scala create mode 100644 plugin/src/test/scala/com/stratio/cassandra/lucene/schema/analysis/tokenFilter/TokenFilterBuilderTest.scala diff --git a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/AnalyzerBuilder.java 
b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/AnalyzerBuilder.java index ed8ff267a..0382b7db4 100644 --- a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/AnalyzerBuilder.java +++ b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/AnalyzerBuilder.java @@ -17,6 +17,7 @@ import com.fasterxml.jackson.annotation.JsonSubTypes; import com.fasterxml.jackson.annotation.JsonTypeInfo; +import com.stratio.cassandra.lucene.schema.analysis.analyzer.CustomAnalyzerBuilder; import org.apache.lucene.analysis.Analyzer; /** diff --git a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/CustomAnalyzerBuilder.java b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/CustomAnalyzerBuilder.java deleted file mode 100644 index 74bc236e1..000000000 --- a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/CustomAnalyzerBuilder.java +++ /dev/null @@ -1,53 +0,0 @@ -/* - * Copyright (C) 2014 Stratio (http://stratio.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package com.stratio.cassandra.lucene.schema.analysis; - -/** - * @author Eduardo Alonso {@literal } - */ - -import com.fasterxml.jackson.annotation.JsonCreator; -import com.fasterxml.jackson.annotation.JsonProperty; -import com.stratio.cassandra.lucene.schema.analysis.tokenizer.TokenizerBuilder; -import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.Tokenizer; - -/** - * {@link AnalyzerBuilder} for building {@link Analyzer}s based on an advanced configuration. - */ -public class CustomAnalyzerBuilder extends AnalyzerBuilder { - - @JsonProperty("tokenizer") - private final TokenizerBuilder tokenizer; - - /** - * Builds a new {@link AnalyzerBuilder} using custom tokenizer, char_filters and token_filters. - * - * @param tokenizer an {@link TokenizerBuilder} the tookenizer to use. - */ - @JsonCreator - public CustomAnalyzerBuilder(@JsonProperty("tokenizer") TokenizerBuilder tokenizer) { - this.tokenizer = tokenizer; - } - - /** {@inheritDoc} */ - @Override - public Analyzer analyzer() { - return new CustomAnalyzer((Tokenizer)tokenizer.buildTokenizer()); - } -} - - diff --git a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/ClassicTokenizerBuilder.java b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/ClassicTokenizerBuilder.java deleted file mode 100644 index 18edfcd00..000000000 --- a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/ClassicTokenizerBuilder.java +++ /dev/null @@ -1,16 +0,0 @@ -/* - * Copyright (C) 2014 Stratio (http://stratio.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package com.stratio.cassandra.lucene.schema.analysis.tokenizer; diff --git a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/EdgeNGramTokenizerBuilder.java b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/EdgeNGramTokenizerBuilder.java deleted file mode 100644 index 18edfcd00..000000000 --- a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/EdgeNGramTokenizerBuilder.java +++ /dev/null @@ -1,16 +0,0 @@ -/* - * Copyright (C) 2014 Stratio (http://stratio.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package com.stratio.cassandra.lucene.schema.analysis.tokenizer; diff --git a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/KeywordTokenizerBuilder.java b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/KeywordTokenizerBuilder.java deleted file mode 100644 index 18edfcd00..000000000 --- a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/KeywordTokenizerBuilder.java +++ /dev/null @@ -1,16 +0,0 @@ -/* - * Copyright (C) 2014 Stratio (http://stratio.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package com.stratio.cassandra.lucene.schema.analysis.tokenizer; diff --git a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/LetterTokenizerBuilder.java b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/LetterTokenizerBuilder.java deleted file mode 100644 index 9f3308381..000000000 --- a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/LetterTokenizerBuilder.java +++ /dev/null @@ -1,17 +0,0 @@ -/* - * Copyright (C) 2014 Stratio (http://stratio.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package com.stratio.cassandra.lucene.schema.analysis.tokenizer; - diff --git a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/LowerCaseTokenizerBuilder.java b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/LowerCaseTokenizerBuilder.java deleted file mode 100644 index 18edfcd00..000000000 --- a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/LowerCaseTokenizerBuilder.java +++ /dev/null @@ -1,16 +0,0 @@ -/* - * Copyright (C) 2014 Stratio (http://stratio.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package com.stratio.cassandra.lucene.schema.analysis.tokenizer; diff --git a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/NGramTokenizerBuilder.java b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/NGramTokenizerBuilder.java deleted file mode 100644 index 18edfcd00..000000000 --- a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/NGramTokenizerBuilder.java +++ /dev/null @@ -1,16 +0,0 @@ -/* - * Copyright (C) 2014 Stratio (http://stratio.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package com.stratio.cassandra.lucene.schema.analysis.tokenizer; diff --git a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/PathHierarchyTokenizerBuilder.java b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/PathHierarchyTokenizerBuilder.java deleted file mode 100644 index 18edfcd00..000000000 --- a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/PathHierarchyTokenizerBuilder.java +++ /dev/null @@ -1,16 +0,0 @@ -/* - * Copyright (C) 2014 Stratio (http://stratio.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package com.stratio.cassandra.lucene.schema.analysis.tokenizer; diff --git a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/PatternTokenizerBuilder.java b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/PatternTokenizerBuilder.java deleted file mode 100644 index 9f3308381..000000000 --- a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/PatternTokenizerBuilder.java +++ /dev/null @@ -1,17 +0,0 @@ -/* - * Copyright (C) 2014 Stratio (http://stratio.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package com.stratio.cassandra.lucene.schema.analysis.tokenizer; - diff --git a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/ReversePathHierarchyTokenizerBuilder.java b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/ReversePathHierarchyTokenizerBuilder.java deleted file mode 100644 index 9f3308381..000000000 --- a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/ReversePathHierarchyTokenizerBuilder.java +++ /dev/null @@ -1,17 +0,0 @@ -/* - * Copyright (C) 2014 Stratio (http://stratio.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package com.stratio.cassandra.lucene.schema.analysis.tokenizer; - diff --git a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/StandardTokenizerBuilder.java b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/StandardTokenizerBuilder.java deleted file mode 100644 index 9f3308381..000000000 --- a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/StandardTokenizerBuilder.java +++ /dev/null @@ -1,17 +0,0 @@ -/* - * Copyright (C) 2014 Stratio (http://stratio.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package com.stratio.cassandra.lucene.schema.analysis.tokenizer; - diff --git a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/ThaiTokenizerBuilder.java b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/ThaiTokenizerBuilder.java deleted file mode 100644 index 18edfcd00..000000000 --- a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/ThaiTokenizerBuilder.java +++ /dev/null @@ -1,16 +0,0 @@ -/* - * Copyright (C) 2014 Stratio (http://stratio.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package com.stratio.cassandra.lucene.schema.analysis.tokenizer; diff --git a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/TokenizerBuilder.java b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/TokenizerBuilder.java deleted file mode 100644 index 18edfcd00..000000000 --- a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/TokenizerBuilder.java +++ /dev/null @@ -1,16 +0,0 @@ -/* - * Copyright (C) 2014 Stratio (http://stratio.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package com.stratio.cassandra.lucene.schema.analysis.tokenizer; diff --git a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/UAX29URLEmailTokenizerBuilder.java b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/UAX29URLEmailTokenizerBuilder.java deleted file mode 100644 index 65b467cfd..000000000 --- a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/UAX29URLEmailTokenizerBuilder.java +++ /dev/null @@ -1,18 +0,0 @@ -/* - * Copyright (C) 2014 Stratio (http://stratio.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package com.stratio.cassandra.lucene.schema.analysis.tokenizer; - - diff --git a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/UnicodeWhitespaceTokenizerBuilder.java b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/UnicodeWhitespaceTokenizerBuilder.java deleted file mode 100644 index 18edfcd00..000000000 --- a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/UnicodeWhitespaceTokenizerBuilder.java +++ /dev/null @@ -1,16 +0,0 @@ -/* - * Copyright (C) 2014 Stratio (http://stratio.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package com.stratio.cassandra.lucene.schema.analysis.tokenizer; diff --git a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/WhitespaceTokenizerBuilder.java b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/WhitespaceTokenizerBuilder.java deleted file mode 100644 index 18edfcd00..000000000 --- a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/WhitespaceTokenizerBuilder.java +++ /dev/null @@ -1,16 +0,0 @@ -/* - * Copyright (C) 2014 Stratio (http://stratio.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package com.stratio.cassandra.lucene.schema.analysis.tokenizer; diff --git a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/WikipediaTokenizerBuilder.java b/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/WikipediaTokenizerBuilder.java deleted file mode 100644 index 18edfcd00..000000000 --- a/plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/tokenizer/WikipediaTokenizerBuilder.java +++ /dev/null @@ -1,16 +0,0 @@ -/* - * Copyright (C) 2014 Stratio (http://stratio.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package com.stratio.cassandra.lucene.schema.analysis.tokenizer;
diff --git a/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/Builder.scala b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/Builder.scala
new file mode 100644
index 000000000..af6ef7a0a
--- /dev/null
+++ b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/Builder.scala
@@ -0,0 +1,83 @@
+/*
+ * Copyright (C) 2014 Stratio (http://stratio.com)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.stratio.cassandra.lucene.schema.analysis
+
+import java.util
+import collection.JavaConverters._
+import scala.reflect.runtime.{universe=>ru}
+
+/**
+ * @author Juan Pedro Gilaberte jpgilaberte@stratio.com
+ */
+trait Builder[T] {
+
+  /**
+   * Returns the function that creates the underlying Lucene object.
+   *
+   * @return a function building the Lucene object
+   */
+  def buildFunction : () => T
+
+  //TODO: refactor scala style (remove throw and manage exception in centralized layer)
+  /**
+   * Auxiliary function to propagate Java exceptions raised in Lucene's layer.
+   *
+   * @param throwable the error raised while building the Lucene object
+   * @return nothing, the given error is always rethrown
+   */
+  def failThrowException(throwable: Throwable) = throw throwable
+
+  /**
+   * Builds the Lucene object, managing any Java exception raised in Lucene's layer.
+   *
+   * @return the built Lucene object
+   */
+  def build: T = {
+    import scala.util.control.Exception._
+    catching(classOf[Exception]).either(buildFunction()).asInstanceOf[Either[Exception, T]].fold(failThrowException, x=>x)
+  }
+
+  /**
+   * Returns the given parameter if it is defined, or the default value otherwise.
+   *
+   * @param param the main parameter.
+   * @param defaultParam the default value, used when the main parameter is null.
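+   *
+   * For example (illustrative values only): `getOrDefault(Some(3), 1)` yields `3` and
+   * `getOrDefault(None, 1)` yields `1`.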
* @return param if it is defined, defaultParam otherwise
+   */
+  def getOrDefault(param: Option[Any], defaultParam: Any): Any = param.getOrElse(defaultParam)
+
+  /**
+   * Collects, via Scala reflection, the term symbols of every getter declared by the concrete builder.
+   * @return the getter term symbols of this builder
+   */
+  def termSymbolsList = scala.reflect.runtime.currentMirror.classSymbol(this.getClass).toType
+    .members.collect { case m: ru.TermSymbol if m.isGetter => m }.map(_.asTerm)
+
+  /**
+   * Reads the current value of the field backing the given term symbol.
+   * @param termString the term symbol whose backing field is read
+   * @return the value of the reflected field
+   */
+  def reflectedFieldValue(termString: ru.TermSymbol) = ru.runtimeMirror(this.getClass.getClassLoader)
+    .reflect(this).reflectField(termString).get
+
+  /**
+   * Maps every non-null builder parameter by name, in the form expected by Lucene's analysis factories.
+   * @return a map from parameter name to its string value
+   */
+  def mapParsed = new util.HashMap[String, String](termSymbolsList.collect({case tm: ru.TermSymbol if reflectedFieldValue(tm) != null => tm})
+    .map(x => (x.name.toString, reflectedFieldValue(x).toString)).toMap.asJava)
+}
\ No newline at end of file
diff --git a/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/analyzer/CustomAnalyzerBuilder.scala b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/analyzer/CustomAnalyzerBuilder.scala
new file mode 100644
index 000000000..f55f36e46
--- /dev/null
+++ b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/analyzer/CustomAnalyzerBuilder.scala
@@ -0,0 +1,76 @@
+/*
+ * Copyright (C) 2014 Stratio (http://stratio.com)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.stratio.cassandra.lucene.schema.analysis.analyzer
+
+import java.io.StringReader
+import java.util
+import com.fasterxml.jackson.annotation.JsonProperty
+import com.stratio.cassandra.lucene.schema.analysis.AnalyzerBuilder
+import com.stratio.cassandra.lucene.schema.analysis.charFilter.CharFilterBuilder
+import com.stratio.cassandra.lucene.schema.analysis.tokenFilter.TokenFilterBuilder
+import com.stratio.cassandra.lucene.schema.analysis.tokenizer.TokenizerBuilder
+import org.apache.lucene.analysis.{TokenStream, Analyzer}
+import org.apache.lucene.analysis.custom.CustomAnalyzer
+import org.apache.lucene.analysis.util.{CharFilterFactory, TokenFilterFactory, TokenizerFactory}
+
+import scala.util.{Failure, Success, Try}
+
+
+/**
+ * Created by jpgilaberte on 24/05/17.
+ */
+final case class CustomAnalyzerBuilder( @JsonProperty("tokenizer") tokenizer: TokenizerBuilder[_],
+                                        @JsonProperty("char_filter") charFilter: Array[CharFilterBuilder[_]],
+                                        @JsonProperty("token_filter") tokenFilter: Array[TokenFilterBuilder[_]]
+                                      ) extends AnalyzerBuilder {
+
+  override def analyzer(): Analyzer = {
+    val custom = CustomAnalyzer.builder()
+
+    def validateTokenizer(tkf: TokenizerFactory) = tkf.create() // Validate params before index creation
+    def addTokenizerFactory(tkf: TokenizerFactory) = custom.withTokenizer(tkf.getClass, new util.HashMap[String, String](tkf.getOriginalArgs))
+    Try (tokenizer.build.asInstanceOf[TokenizerFactory]) match {
+      case Success(tokenizerFactory) => {validateTokenizer(tokenizerFactory); addTokenizerFactory(tokenizerFactory)}
+      case Failure(e) => {/*Tokenizer is mandatory;
a missing one is detected later in Lucene's CustomAnalyzer layer*/}
+    }
+
+    def validateTokenFilter(tkff: TokenFilterFactory) = tkff.create(new TokenStream() {override def incrementToken(): Boolean = true }) // Validate params before index creation
+    def addTokenFilterFactory(tkff: TokenFilterFactory) = custom.addTokenFilter(tkff.getClass, new util.HashMap[String, String](tkff.getOriginalArgs))
+    tokenFilter match {
+      case null => {/* null when the JSON omits the field (legacy Java builders) */}
+      case _ => { tokenFilter.foreach(x => {
+          val tokenFilterFactory = x.build.asInstanceOf[TokenFilterFactory]
+          validateTokenFilter(tokenFilterFactory)
+          addTokenFilterFactory(tokenFilterFactory)
+        })
+      }
+    }
+
+    def validateCharFilter(cff: CharFilterFactory) = cff.create(new StringReader("validate")) // Validate params before index creation
+    def addCharFilterFactory(cff: CharFilterFactory) = custom.addCharFilter(cff.getClass, new util.HashMap[String, String](cff.getOriginalArgs))
+    charFilter match {
+      case null => {/* null when the JSON omits the field (legacy Java builders) */}
+      case _ => { charFilter.foreach(x => {
+          val charFilterFactory = x.build.asInstanceOf[CharFilterFactory]
+          validateCharFilter(charFilterFactory)
+          addCharFilterFactory(charFilterFactory)
+        })
+      }
+    }
+
+    custom.build()
+  }
+}
diff --git a/plugin/src/test/resources/MappingCharFilter b/plugin/src/test/resources/MappingCharFilter
new file mode 100644
index 000000000..9060ff1ef
--- /dev/null
+++ b/plugin/src/test/resources/MappingCharFilter
@@ -0,0 +1,4 @@
+"aa"=>"a"
+"bb"=>"b"
+"f"=>"z"
+"F"=>"Z"
\ No newline at end of file
diff --git a/plugin/src/test/scala/com/stratio/cassandra/lucene/schema/analysis/BuilderTest.scala b/plugin/src/test/scala/com/stratio/cassandra/lucene/schema/analysis/BuilderTest.scala
new file mode 100644
index 000000000..b1ecfa724
--- /dev/null
+++ b/plugin/src/test/scala/com/stratio/cassandra/lucene/schema/analysis/BuilderTest.scala
@@ -0,0 +1,116 @@
+/*
+ * Copyright (C) 2014 Stratio (http://stratio.com)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.stratio.cassandra.lucene.schema.analysis
+
+import com.stratio.cassandra.lucene.BaseScalaTest
+import org.junit.runner.RunWith
+import org.scalatest.junit.JUnitRunner
+
+
+/** Tests for [[Builder]].
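+  * Covers the reflection helpers (`termSymbolsList`, `reflectedFieldValue`, `mapParsed`) and the error
+  * propagation of `build`.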
+ * + * @author Juan Pedro Gilaberte `jpgilaberte@stratio.com` + */ +@RunWith(classOf[JUnitRunner]) +class BuilderTest extends BaseScalaTest { + + class TestFactory(){ def helloTest = "helloTest"} + sealed abstract class TestBuilder[T](typeTest: String) extends Builder[T]{ + def buildFunction : () => T = () => { + new TestFactory().asInstanceOf[T] + } + } + case class TestBuilderChild(param1: String = "test", param2: Long = 1l, param3: Integer = 2) extends TestBuilder[TestFactory]("test") + case class TestNullBuilderChild(param1: String = null, param2: Long = 1l, param3: Integer = 2) extends TestBuilder[TestFactory]("test") + case class TestExceptionBuilderChild(param1: String = "testException", param2: Long = 1l, param3: Integer = 2) extends TestBuilder[TestFactory]("test"){ + override def buildFunction : () => TestFactory = () => { + throw new RuntimeException("exception in Lucene's layer") + } + } + + //Exception + test("exception build function") { + val testBuilderChild = TestExceptionBuilderChild() + assertThrows[RuntimeException](testBuilderChild.build) + } + + //Null (Jackson doesn't inform value) + test("null termSymbolsList function") { + val testBuilderChild = TestNullBuilderChild() + val termSymbol = testBuilderChild.termSymbolsList.toList + assert(termSymbol.size == 3) + } + + test("null reflectedFieldValue function") { + val testBuilderChild = TestNullBuilderChild() + val termSymbol = testBuilderChild.termSymbolsList + val reflectFieldValueTest1 = testBuilderChild.reflectedFieldValue(termSymbol.toList.apply(0)) + val reflectFieldValueTest2 = testBuilderChild.reflectedFieldValue(termSymbol.toList.apply(1)) + val reflectFieldValueTest3 = testBuilderChild.reflectedFieldValue(termSymbol.toList.apply(2)) + assert(reflectFieldValueTest1 == 2) + assert(reflectFieldValueTest2 == 1l) + assert(reflectFieldValueTest3 == null) + } + + test("null mapParsedFunction function") { +// val testBuilderChild = TestNullBuilderChild() +// val mapParsedTest = testBuilderChild.mapParsed +// assert(mapParsedTest.size == 2) +// assertThrows[NoSuchElementException](mapParsedTest.apply("param1") == "test") +// assert(mapParsedTest.apply("param2") == "1") +// assert(mapParsedTest.apply("param3") == "2") + } + + test("null build function") { + val testBuilderChild = TestNullBuilderChild() + val factoryTest = testBuilderChild.build + assert(factoryTest.helloTest == "helloTest") + } + + + // Right + test("termSymbolsList function") { + val testBuilderChild = TestBuilderChild() + val termSymbol = testBuilderChild.termSymbolsList.toList + assert(termSymbol.size == 3) + } + + test("reflectedFieldValue function") { + val testBuilderChild = TestBuilderChild() + val termSymbol = testBuilderChild.termSymbolsList + val reflectFieldValueTest1 = testBuilderChild.reflectedFieldValue(termSymbol.toList.apply(0)) + val reflectFieldValueTest2 = testBuilderChild.reflectedFieldValue(termSymbol.toList.apply(1)) + val reflectFieldValueTest3 = testBuilderChild.reflectedFieldValue(termSymbol.toList.apply(2)) + assert(reflectFieldValueTest1 == 2) + assert(reflectFieldValueTest2 == 1l) + assert(reflectFieldValueTest3 == "test") + } + + test("mapParsed function") { +// val testBuilderChild = TestBuilderChild() +// val mapParsedTest = testBuilderChild.mapParsed +// assert(mapParsedTest.size == 3) +// assert(mapParsedTest.apply("param1") == "test") +// assert(mapParsedTest.apply("param2") == "1") +// assert(mapParsedTest.apply("param3") == "2") + } + + test("build function") { + val testBuilderChild = TestBuilderChild() + val 
factoryTest: TestFactory = testBuilderChild.build
+    assert(factoryTest.helloTest == "helloTest")
+  }
+}
diff --git a/plugin/src/test/scala/com/stratio/cassandra/lucene/schema/analysis/analyzer/CustomAnalyzerTest.scala b/plugin/src/test/scala/com/stratio/cassandra/lucene/schema/analysis/analyzer/CustomAnalyzerTest.scala
new file mode 100644
index 000000000..594007f65
--- /dev/null
+++ b/plugin/src/test/scala/com/stratio/cassandra/lucene/schema/analysis/analyzer/CustomAnalyzerTest.scala
@@ -0,0 +1,101 @@
+/*
+ * Copyright (C) 2014 Stratio (http://stratio.com)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.stratio.cassandra.lucene.schema.analysis.analyzer
+
+import com.stratio.cassandra.lucene.BaseScalaTest
+import com.stratio.cassandra.lucene.common.JsonSerializer
+import com.stratio.cassandra.lucene.schema.analysis.AnalyzerBuilder
+import org.apache.lucene.analysis.tokenattributes.{OffsetAttribute, CharTermAttribute}
+import org.apache.lucene.analysis.Analyzer
+import org.junit.runner.RunWith
+import org.scalatest.junit.JUnitRunner
+
+
+/** Tests for the custom analyzer built by [[CustomAnalyzerBuilder]].
+ *
+ * @author Juan Pedro Gilaberte `jpgilaberte@stratio.com`
+ */
+@RunWith(classOf[JUnitRunner])
+class CustomAnalyzerTest extends BaseScalaTest {
+
+  def assertBuildException(custom: String, source: String, result: Array[String], count: Array[Int]) = {
+    assertThrows[Throwable](assertCustomContents(custom, source, result, count))
+  }
+  def assertCustomContents(custom: String, source: String, result: Array[String], count: Array[Int]) = {
+
+    val analyzerBuilder: AnalyzerBuilder = JsonSerializer.fromString(custom, classOf[AnalyzerBuilder])
+    val customAnalyzer: Analyzer = analyzerBuilder.analyzer()
+    val ts = customAnalyzer.tokenStream("dummy", source)
+
+    ts.reset()
+    count.map(x => {
+      ts.incrementToken()
+      val res = ts.getAttribute(classOf[CharTermAttribute])
+      assert(new String(res.buffer(), 0, res.length()).equals(result(x)))
+    })
+    ts.end()
+    ts.close()
+  }
+
+  test("CustomAnalyzer deserialize json exception") {
+    val custom = """{type:"custo"}""".stripMargin
+    assertBuildException(custom, "föó bär FÖÖ BAR", Array("föó", "bär", "FÖÖ", "BAR"), Array(0, 1, 2, 3))
+  }
+
+  test("CustomAnalyzer only tokenizer") {
+    val custom = """{type:"custom", tokenizer: {type:"whitespace"}}""".stripMargin
+    assertCustomContents(custom, "föó bär FÖÖ BAR", Array("föó", "bär", "FÖÖ", "BAR"), Array(0, 1, 2, 3))
+  }
+
+  test("CustomAnalyzer tokenizer mandatory exception") {
+    val custom = """{type:"custom"}""".stripMargin
+    assertBuildException(custom, "föó bär FÖÖ BAR", Array("föó", "bär", "FÖÖ", "BAR"), Array(0, 1, 2, 3))
+  }
+
+  test("CustomAnalyzer token_filter") {
+    val custom = """{type:"custom", tokenizer: {type:"whitespace"}, token_filter: [{type:"asciifolding"}, {type:"lowercase"}]}""".stripMargin
+    assertCustomContents(custom, "föó bär FÖÖ BAR", Array("foo", "bar", "foo", "bar"), Array(0, 1, 2, 3))
+  }
+
+  test("CustomAnalyzer token_filter empty") {
+    val custom = """{type:"custom", tokenizer: {type:"whitespace"}, token_filter: []}""".stripMargin
+
assertCustomContents(custom, "föó bär FÖÖ BAR", Array("föó", "bär", "FÖÖ", "BAR"), Array(0, 1, 2, 3)) + } + + test("CustomAnalyzer token_filter json exception") { + val custom = """{type:"custom", tokenizer: {type:"whitespace"}, token_filter: }""".stripMargin + assertBuildException(custom, "föó bär FÖÖ BAR", Array("föó", "bär", "FÖÖ", "BAR"), Array(0, 1, 2, 3)) + } + + test("CustomAnalyzer char_filter") { + val custom = """{type:"custom", tokenizer: {type:"whitespace"}, char_filter: [{type:"mapping", mapping:"MappingCharFilter"}, {type:"patternreplace", pattern:"(a)\\s+(b)", replacement:"$1#$2"}]}""".stripMargin + assertCustomContents(custom, "aa bb aa bb", Array("a#b", "a#b"), Array(0, 1)) + } + + test("CustomAnalyzer char_filter empty") { + val custom = """{type:"custom", tokenizer: {type:"whitespace"}, char_filter: []}""".stripMargin + assertCustomContents(custom, "a f ff aa", Array("a", "f", "ff", "aa"), Array(0, 1, 2, 3)) + } + + test("CustomAnalyzer full") { + val custom = """{type:"custom", tokenizer: {type:"whitespace"}, char_filter: [{type:"mapping", mapping:"MappingCharFilter"}, {type:"patternreplace", pattern:"(zöó)\\s+(zöó)", replacement:"$1#$2"}], token_filter: [{type:"asciifolding"}, {type:"lowercase"}]}""".stripMargin + assertCustomContents(custom, "föó föó bär FÖÖ BAR", Array("zoo#zoo", "bar", "zoo", "bar"), Array(0, 1, 2, 3)) + } + + test("CustomAnalyzer validate before 'cassandra index build'") { + val custom = """{type:"custom", tokenizer: {type:"ngram", "max_gram_size": 1, "min_gram_size": 2}}""".stripMargin + assertBuildException(custom, "aabb", Array("aa", "ab", "bb"), Array(0, 1)) + } +} diff --git a/plugin/src/test/scala/com/stratio/cassandra/lucene/schema/analysis/charFilter/CharFilterBuilderTest.scala b/plugin/src/test/scala/com/stratio/cassandra/lucene/schema/analysis/charFilter/CharFilterBuilderTest.scala new file mode 100644 index 000000000..b301a9d8f --- /dev/null +++ b/plugin/src/test/scala/com/stratio/cassandra/lucene/schema/analysis/charFilter/CharFilterBuilderTest.scala @@ -0,0 +1,78 @@ +/* + * Copyright (C) 2014 Stratio (http://stratio.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.stratio.cassandra.lucene.schema.analysis.charFilter + +import java.io.StringReader + +import com.stratio.cassandra.lucene.BaseScalaTest +import com.stratio.cassandra.lucene.common.JsonSerializer +import org.apache.lucene.analysis.util.CharFilterFactory +import org.junit.runner.RunWith +import org.scalatest.junit.JUnitRunner + + +/** Tests for [[CharFilterBuilder]]. 
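+ *
+ * Each char filter builder is deserialized from the same JSON accepted by the index
+ * schema, e.g. (verbatim from the cases below):
+ * {{{
+ * {"type":"patternreplace", "pattern":"/W+", "replacement":"test"}
+ * }}}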
+ * + * @author Juan Pedro Gilaberte `jpgilaberte@stratio.com` + */ +@RunWith(classOf[JUnitRunner]) +class CharFilterBuilderTest extends BaseScalaTest { + + test("available charFilter"){ + assert(CharFilterFactory.availableCharFilters().size() == 4) + } + + type T = CharFilterBuilder[CharFilterFactory] + + def assertBuild(tokenizerBuilder: T, factoryClass: String, tokenizerClass: String) = { + val tokenizerFactory = tokenizerBuilder.build + val tokenizer = tokenizerFactory.create(new StringReader("hello test")) + assert(tokenizerFactory.getClass.getSimpleName == factoryClass) + assert(tokenizer.getClass.getSimpleName == tokenizerClass) + } + + def assertBuildException (tokenizerBuilder: T) = assertThrows[RuntimeException](tokenizerBuilder.build.create(new StringReader("hello test"))) + + test("HtmlStripCharFilterBuilder") { + val jsonTest1 = """{"type":"htmlstrip"}""" + val factoryName = "HTMLStripCharFilterFactory" + val tokenizerName = "HTMLStripCharFilter" + assertBuild(JsonSerializer.fromString(jsonTest1, classOf[HtmlStripCharFilterBuilder]), factoryName, tokenizerName) + } + + test("MappingCharFilterBuilder") { + val jsonTest1 = """{type:"mapping", mapping: "MappingCharFilter"}""" + val factoryName = "MappingCharFilterFactory" + val tokenizerName = "StringReader" + assertBuild(JsonSerializer.fromString(jsonTest1, classOf[MappingCharFilterBuilder]), factoryName, tokenizerName) + } + + test("PatternReplaceCharFilterBuilder") { + val jsonTest1 = """{"type":"patternreplace", "pattern":"/W+", "replacement":"test"}""" + val jsonTest2 = """{"type":"patternreplace"}""" + val factoryName = "PatternReplaceCharFilterFactory" + val tokenizerName = "PatternReplaceCharFilter" + assertBuild(JsonSerializer.fromString(jsonTest1, classOf[PatternReplaceCharFilterBuilder]), factoryName, tokenizerName) + assertBuildException(JsonSerializer.fromString(jsonTest2, classOf[PatternReplaceCharFilterBuilder])) + } + + test("PersianCharFilterBuilder") { + val jsonTest1 = """{"type":"persian"}""" + val factoryName = "PersianCharFilterFactory" + val tokenizerName = "PersianCharFilter" + assertBuild(JsonSerializer.fromString(jsonTest1, classOf[PersianCharFilterBuilder]), factoryName, tokenizerName) + } +} \ No newline at end of file diff --git a/plugin/src/test/scala/com/stratio/cassandra/lucene/schema/analysis/tokenFilter/TokenFilterBuilderTest.scala b/plugin/src/test/scala/com/stratio/cassandra/lucene/schema/analysis/tokenFilter/TokenFilterBuilderTest.scala new file mode 100644 index 000000000..82b2c6d43 --- /dev/null +++ b/plugin/src/test/scala/com/stratio/cassandra/lucene/schema/analysis/tokenFilter/TokenFilterBuilderTest.scala @@ -0,0 +1,61 @@ +/* + * Copyright (C) 2014 Stratio (http://stratio.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+package com.stratio.cassandra.lucene.schema.analysis.tokenFilter
+
+import com.stratio.cassandra.lucene.BaseScalaTest
+import com.stratio.cassandra.lucene.common.JsonSerializer
+import org.apache.lucene.analysis.TokenStream
+import org.apache.lucene.analysis.util.TokenFilterFactory
+import org.junit.runner.RunWith
+import org.scalatest.junit.JUnitRunner
+
+
+/** Tests for [[TokenFilterBuilder]].
+ *
+ * @author Juan Pedro Gilaberte `jpgilaberte@stratio.com`
+ */
+@RunWith(classOf[JUnitRunner])
class TokenFilterBuilderTest extends BaseScalaTest {
+
+  test("available tokenFilter"){
+    assert(TokenFilterFactory.availableTokenFilters().size() == 90)
+  }
+
+  type T = TokenFilterBuilder[TokenFilterFactory]
+
+  def assertBuild(filterBuilder: T, factoryClass: String, filterClass: String) = {
+    val filterFactory = filterBuilder.build
+    val filter = filterFactory.create(new TokenStream() {override def incrementToken(): Boolean = true})
+    assert(filterFactory.getClass.getSimpleName == factoryClass)
+    assert(filter.getClass.getSimpleName == filterClass)
+  }
+
+  def assertBuildException(filterBuilder: T) = assertThrows[RuntimeException](filterBuilder.build.create(new TokenStream() {override def incrementToken(): Boolean = true}))
+
+  test("StandardFilterBuilder") {
+    val jsonTest1 = """{"type":"standard"}"""
+    val factoryName = "StandardFilterFactory"
+    val filterName = "StandardFilter"
+    assertBuild(JsonSerializer.fromString(jsonTest1, classOf[StandardTokenFilterBuilder]), factoryName, filterName)
+  }
+
+  test("ApostropheFilterBuilder") {
+    val jsonTest1 = """{"type":"apostrophe"}"""
+    val factoryName = "ApostropheFilterFactory"
+    val filterName = "ApostropheFilter"
+    assertBuild(JsonSerializer.fromString(jsonTest1, classOf[ApostropheTokenFilterBuilder]), factoryName, filterName)
+  }
+}
diff --git a/plugin/src/test/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/TokenizerBuilderTest.scala b/plugin/src/test/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/TokenizerBuilderTest.scala
index e3c2d2aff..99dec2d07 100644
--- a/plugin/src/test/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/TokenizerBuilderTest.scala
+++ b/plugin/src/test/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/TokenizerBuilderTest.scala
@@ -23,6 +23,7 @@ import org.apache.lucene.analysis.path.{ReversePathHierarchyTokenizer, PathHiera
 import org.apache.lucene.analysis.pattern.PatternTokenizer
 import org.apache.lucene.analysis.standard.{UAX29URLEmailTokenizer, StandardTokenizer, ClassicTokenizer}
 import org.apache.lucene.analysis.th.ThaiTokenizer
+import org.apache.lucene.analysis.util.TokenizerFactory
 import org.apache.lucene.analysis.wikipedia.WikipediaTokenizer
 import org.junit.runner.RunWith
 import org.scalatest.junit.JUnitRunner
@@ -33,231 +34,154 @@ import scala.util.Try
 *
 * @author Juan Pedro Gilaberte `jpgilaberte@stratio.com`
 */
-
 @RunWith(classOf[JUnitRunner])
 class TokenizerBuilderTest extends BaseScalaTest{
-  def failFlow(throwable: Throwable) = fail(throwable.getMessage, throwable)
-
-  def buildAbstractBuilder(json: String, builderClass: Class[_]): Any = Try(JsonSerializer.fromString(json, builderClass)).fold(failFlow, x=>x)
-
-  test("ClassicTokenizerBuilder parse JSON") {
-    val abstractBuilder = buildAbstractBuilder("{type: \"classic\", max_token_length: 1}",
-      classOf[TokenizerBuilder[ClassicTokenizer]]).asInstanceOf[TokenizerBuilder[ClassicTokenizer]]
-    assert(classOf[ClassicTokenizerBuilder].getCanonicalName ==
abstractBuilder.getClass.getCanonicalName) - val tokenizer = abstractBuilder.buildTokenizer - assert(classOf[ClassicTokenizer].getCanonicalName == tokenizer.getClass.getCanonicalName) - assert(1 == tokenizer.getMaxTokenLength) - } - - test("ClassicTokenizerBuilder parse JSON throw IllegalArgumentException") { - val abstractBuilder = buildAbstractBuilder("{type: \"classic\", max_token_length: 0}", - classOf[TokenizerBuilder[ClassicTokenizer]]).asInstanceOf[TokenizerBuilder[ClassicTokenizer]] - assert(classOf[ClassicTokenizerBuilder].getCanonicalName == abstractBuilder.getClass.getCanonicalName) - assertThrows[IllegalArgumentException]{abstractBuilder.buildTokenizer} - } - - test("ClassicTokenizerBuilder parse JSON default values") { - val abstractBuilder = buildAbstractBuilder("{type: \"classic\"}", - classOf[TokenizerBuilder[ClassicTokenizer]]).asInstanceOf[TokenizerBuilder[ClassicTokenizer]] - assert(classOf[ClassicTokenizerBuilder].getCanonicalName == abstractBuilder.getClass.getCanonicalName) - val tokenizer = abstractBuilder.buildTokenizer - assert(classOf[ClassicTokenizer].getCanonicalName == tokenizer.getClass.getCanonicalName) - assert(ClassicTokenizerBuilder.DEFAULT_MAX_TOKEN_LENGTH == tokenizer.getMaxTokenLength) - } - - test("EdgeNGramTokenizerBuilder parse JSON") { - val abstractBuilder = buildAbstractBuilder("{type: \"edge_ngram\", min_gram: 1, max_gram: 2}", - classOf[TokenizerBuilder[EdgeNGramTokenizer]]).asInstanceOf[TokenizerBuilder[EdgeNGramTokenizer]] - assert(classOf[EdgeNGramTokenizerBuilder].getCanonicalName == abstractBuilder.getClass.getCanonicalName) - val tokenizer = abstractBuilder.buildTokenizer - assert(classOf[EdgeNGramTokenizer].getCanonicalName == tokenizer.getClass.getCanonicalName) - assert(1 == abstractBuilder.asInstanceOf[EdgeNGramTokenizerBuilder].minGram) - assert(2 == abstractBuilder.asInstanceOf[EdgeNGramTokenizerBuilder].maxGram) - } - - test("EdgeNGramTokenizerBuilder parse JSON throws IllegalArgumentException") { - val abstractBuilder = buildAbstractBuilder("{type: \"edge_ngram\", min_gram: -1, max_gram: 2}", - classOf[TokenizerBuilder[EdgeNGramTokenizer]]).asInstanceOf[TokenizerBuilder[EdgeNGramTokenizer]] - assert(classOf[EdgeNGramTokenizerBuilder].getCanonicalName == abstractBuilder.getClass.getCanonicalName) - assertThrows[IllegalArgumentException]{abstractBuilder.buildTokenizer} - } - - test("KeywordTokenizerBuilder parse JSON") { - val abstractBuilder = buildAbstractBuilder("{type: \"keyword\", buffer_size: 256}", - classOf[TokenizerBuilder[KeywordTokenizer]]).asInstanceOf[TokenizerBuilder[KeywordTokenizer]] - assert(classOf[KeywordTokenizerBuilder].getCanonicalName == abstractBuilder.getClass.getCanonicalName) - val tokenizer = abstractBuilder.buildTokenizer - assert(classOf[KeywordTokenizer].getCanonicalName == tokenizer.getClass.getCanonicalName) - assert(256 == abstractBuilder.asInstanceOf[KeywordTokenizerBuilder].bufferSize) - } - - test("KeywordTokenizerBuilder parse JSON throw IllegalArgumentException") { - val abstractBuilder = buildAbstractBuilder("{type: \"keyword\", buffer_size: -256}", - classOf[TokenizerBuilder[ClassicTokenizer]]).asInstanceOf[TokenizerBuilder[ClassicTokenizer]] - assert(classOf[KeywordTokenizerBuilder].getCanonicalName == abstractBuilder.getClass.getCanonicalName) - assertThrows[IllegalArgumentException]{abstractBuilder.buildTokenizer} - } - - test("LetterTokenizerBuilder parse JSON") { - val abstractBuilder = buildAbstractBuilder("{type: \"letter\"}", - 
classOf[TokenizerBuilder[LetterTokenizer]]).asInstanceOf[TokenizerBuilder[LetterTokenizer]] - assert(classOf[LetterTokenizerBuilder].getCanonicalName == abstractBuilder.getClass.getCanonicalName) - val tokenizer = abstractBuilder.buildTokenizer - assert(classOf[LetterTokenizer].getCanonicalName == tokenizer.getClass.getCanonicalName) - } - - test("LowerCaseTokenizerBuilder parse JSON") { - val abstractBuilder = buildAbstractBuilder("{type: \"lower_case\"}", - classOf[TokenizerBuilder[LowerCaseTokenizer]]).asInstanceOf[TokenizerBuilder[LowerCaseTokenizer]] - assert(classOf[LowerCaseTokenizerBuilder].getCanonicalName == abstractBuilder.getClass.getCanonicalName) - val tokenizer = abstractBuilder.buildTokenizer - assert(classOf[LowerCaseTokenizer].getCanonicalName == tokenizer.getClass.getCanonicalName) - } - - test("NGramTokenizerBuilder parse JSON") { - val abstractBuilder = buildAbstractBuilder("{type: \"ngram\", min_gram: 1, max_gram: 2}", - classOf[TokenizerBuilder[NGramTokenizer]]).asInstanceOf[TokenizerBuilder[NGramTokenizer]] - assert(classOf[NGramTokenizerBuilder].getCanonicalName == abstractBuilder.getClass.getCanonicalName) - val tokenizer = abstractBuilder.buildTokenizer - assert(classOf[NGramTokenizer].getCanonicalName == tokenizer.getClass.getCanonicalName) - assert(1 == abstractBuilder.asInstanceOf[NGramTokenizerBuilder].minGram) - assert(2 == abstractBuilder.asInstanceOf[NGramTokenizerBuilder].maxGram) - } - - test("NGramTokenizerBuilder parse JSON throws IllegalArgumentException") { - val abstractBuilder = buildAbstractBuilder("{type: \"ngram\", min_gram: -1, max_gram: 2}", - classOf[TokenizerBuilder[NGramTokenizer]]).asInstanceOf[TokenizerBuilder[NGramTokenizer]] - assert(classOf[NGramTokenizerBuilder].getCanonicalName == abstractBuilder.getClass.getCanonicalName) - assertThrows[IllegalArgumentException]{abstractBuilder.buildTokenizer} - } - - test("PathHierarchyTokenizerBuilder parse JSON") { - val abstractBuilder = buildAbstractBuilder("{type: \"path_hierarchy\", buffer_size: 246, delimiter: \"$\", replacement: \"%\", skip: 3}", - classOf[TokenizerBuilder[PathHierarchyTokenizer]]).asInstanceOf[TokenizerBuilder[PathHierarchyTokenizer]] - assert(classOf[PathHierarchyTokenizerBuilder].getCanonicalName == abstractBuilder.getClass.getCanonicalName) - val tokenizer = abstractBuilder.buildTokenizer - assert(classOf[PathHierarchyTokenizer].getCanonicalName == tokenizer.getClass.getCanonicalName) - assert(246 == abstractBuilder.asInstanceOf[PathHierarchyTokenizerBuilder].bufferSize) - assert('$' == abstractBuilder.asInstanceOf[PathHierarchyTokenizerBuilder].delimiter) - assert('%' == abstractBuilder.asInstanceOf[PathHierarchyTokenizerBuilder].replacement) - assert(3 == abstractBuilder.asInstanceOf[PathHierarchyTokenizerBuilder].skip) - } - - test("PathHierarchyTokenizerBuilder parse JSON throws IllegalArgumentException") { - val abstractBuilder = buildAbstractBuilder("{type: \"path_hierarchy\", buffer_size: 246, delimiter: \"$\", replacement: \"%\", skip: -3}", - classOf[TokenizerBuilder[PathHierarchyTokenizer]]).asInstanceOf[TokenizerBuilder[PathHierarchyTokenizer]] - assert(classOf[PathHierarchyTokenizerBuilder].getCanonicalName == abstractBuilder.getClass.getCanonicalName) - assertThrows[IllegalArgumentException]{abstractBuilder.buildTokenizer} - } - - test("PatternTokenizerBuilder parse JSON") { - val abstractBuilder = buildAbstractBuilder("{type: \"pattern\", pattern: \"[a-z]\", flags: 35, group: 0}", - 
classOf[TokenizerBuilder[PatternTokenizer]]).asInstanceOf[TokenizerBuilder[PatternTokenizer]] - assert(classOf[PatternTokenizerBuilder].getCanonicalName == abstractBuilder.getClass.getCanonicalName) - val tokenizer = abstractBuilder.buildTokenizer - assert(classOf[PatternTokenizer].getCanonicalName == tokenizer.getClass.getCanonicalName) - assert("[a-z]" == abstractBuilder.asInstanceOf[PatternTokenizerBuilder].pattern) - assert(35 == abstractBuilder.asInstanceOf[PatternTokenizerBuilder].flags) - assert(0 == abstractBuilder.asInstanceOf[PatternTokenizerBuilder].group) - } - - test("PatternTokenizerBuilder parse JSON throws IllegalArgumentException") { - val abstractBuilder = buildAbstractBuilder("{type: \"pattern\", pattern: \"[a-z]\", flags: 35, group: 2}", - classOf[TokenizerBuilder[PatternTokenizer]]).asInstanceOf[TokenizerBuilder[PatternTokenizer]] - assert(classOf[PatternTokenizerBuilder].getCanonicalName == abstractBuilder.getClass.getCanonicalName) - assertThrows[IllegalArgumentException]{abstractBuilder.buildTokenizer} - } - - test("ReversePathHierarchyTokenizerBuilder parse JSON") { - val abstractBuilder = buildAbstractBuilder("{type: \"reverse_path_hierarchy\", buffer_size: 246, delimiter: \"/\", replacement: \"%\", skip: 3}", - classOf[TokenizerBuilder[ReversePathHierarchyTokenizer]]).asInstanceOf[TokenizerBuilder[ReversePathHierarchyTokenizer]] - assert(classOf[ReversePathHierarchyTokenizerBuilder].getCanonicalName == abstractBuilder.getClass.getCanonicalName) - val tokenizer = abstractBuilder.buildTokenizer - assert(classOf[ReversePathHierarchyTokenizer].getCanonicalName == tokenizer.getClass.getCanonicalName) - assert(246 == abstractBuilder.asInstanceOf[ReversePathHierarchyTokenizerBuilder].bufferSize) - assert('/' == abstractBuilder.asInstanceOf[ReversePathHierarchyTokenizerBuilder].delimiter) - assert('%' == abstractBuilder.asInstanceOf[ReversePathHierarchyTokenizerBuilder].replacement) - assert(3 == abstractBuilder.asInstanceOf[ReversePathHierarchyTokenizerBuilder].skip) - } - - test("ReversePathHierarchyTokenizerBuilder parse JSON throws IllegalArgumentException") { - val abstractBuilder = buildAbstractBuilder("{type: \"reverse_path_hierarchy\", buffer_size: 246, delimiter: \"/\", replacement: \"%\", skip: -3}", - classOf[TokenizerBuilder[ReversePathHierarchyTokenizer]]).asInstanceOf[TokenizerBuilder[ReversePathHierarchyTokenizer]] - assert(classOf[ReversePathHierarchyTokenizerBuilder].getCanonicalName == abstractBuilder.getClass.getCanonicalName) - assertThrows[IllegalArgumentException]{abstractBuilder.buildTokenizer} - } - - test("StandardTokenizerBuilder parse JSON") { - val abstractBuilder = buildAbstractBuilder("{type: \"standard\", max_token_length: 246}", - classOf[TokenizerBuilder[StandardTokenizer]]).asInstanceOf[TokenizerBuilder[StandardTokenizer]] - assert(classOf[StandardTokenizerBuilder].getCanonicalName == abstractBuilder.getClass.getCanonicalName) - val tokenizer = abstractBuilder.buildTokenizer - assert(classOf[StandardTokenizer].getCanonicalName == tokenizer.getClass.getCanonicalName) - assert(246 == abstractBuilder.asInstanceOf[StandardTokenizerBuilder].maxTokenLength) - } - - test("StandardTokenizerBuilder parse JSON throws IllegalArgumentException") { - val abstractBuilder = buildAbstractBuilder("{type: \"standard\", max_token_length: -246}", - classOf[TokenizerBuilder[StandardTokenizer]]).asInstanceOf[TokenizerBuilder[StandardTokenizer]] - assert(classOf[StandardTokenizerBuilder].getCanonicalName == abstractBuilder.getClass.getCanonicalName) - 
assertThrows[IllegalArgumentException]{abstractBuilder.buildTokenizer} - } - - test("ThaiTokenizerBuilder parse JSON") { - val abstractBuilder = buildAbstractBuilder("{type: \"thai\"}", - classOf[TokenizerBuilder[ThaiTokenizer]]).asInstanceOf[TokenizerBuilder[ThaiTokenizer]] - assert(classOf[ThaiTokenizerBuilder].getCanonicalName == abstractBuilder.getClass.getCanonicalName) - val tokenizer = abstractBuilder.buildTokenizer - assert(classOf[ThaiTokenizer].getCanonicalName == tokenizer.getClass.getCanonicalName) - } - - test("UAX29URLEmailTokenizerBuilder parse JSON") { - val abstractBuilder = buildAbstractBuilder("{type: \"uax29_url_email\", max_token_length: 249}", - classOf[TokenizerBuilder[UAX29URLEmailTokenizer]]).asInstanceOf[TokenizerBuilder[UAX29URLEmailTokenizer]] - assert(classOf[UAX29URLEmailTokenizerBuilder].getCanonicalName == abstractBuilder.getClass.getCanonicalName) - val tokenizer = abstractBuilder.buildTokenizer - assert(classOf[UAX29URLEmailTokenizer].getCanonicalName == tokenizer.getClass.getCanonicalName) - assert(249 == abstractBuilder.asInstanceOf[UAX29URLEmailTokenizerBuilder].maxTokenLength) - } - - test("UAX29URLEmailTokenizerBuilder parse JSON throws IllegalArgumentException") { - val abstractBuilder = buildAbstractBuilder("{type: \"uax29_url_email\", max_token_length: -249}", - classOf[TokenizerBuilder[UAX29URLEmailTokenizer]]).asInstanceOf[TokenizerBuilder[UAX29URLEmailTokenizer]] - assert(classOf[UAX29URLEmailTokenizerBuilder].getCanonicalName == abstractBuilder.getClass.getCanonicalName) - assertThrows[IllegalArgumentException]{abstractBuilder.buildTokenizer} - } - - test("UnicodeWhitespaceTokenizerBuilder parse JSON") { - val abstractBuilder = buildAbstractBuilder("{type:\"unicode_whitespace\"}", - classOf[TokenizerBuilder[UnicodeWhitespaceTokenizer]]).asInstanceOf[TokenizerBuilder[UnicodeWhitespaceTokenizer]] - assert(classOf[UnicodeWhitespaceTokenizerBuilder].getCanonicalName == abstractBuilder.getClass.getCanonicalName) - val tokenizer = abstractBuilder.buildTokenizer - assert(classOf[UnicodeWhitespaceTokenizer].getCanonicalName == tokenizer.getClass.getCanonicalName) - } - - test("WhitespaceTokenizerBuilder parse JSON") { - val abstractBuilder = buildAbstractBuilder("{type:\"whitespace\"}", - classOf[TokenizerBuilder[WhitespaceTokenizer]]).asInstanceOf[TokenizerBuilder[WhitespaceTokenizer]] - assert(classOf[WhitespaceTokenizerBuilder].getCanonicalName == abstractBuilder.getClass.getCanonicalName) - val tokenizer = abstractBuilder.buildTokenizer - assert(classOf[WhitespaceTokenizer].getCanonicalName == tokenizer.getClass.getCanonicalName) - } - - test("WikipediaTokenizerBuilder parse JSON") { - val abstractBuilder = buildAbstractBuilder("{type: \"wikipedia\", token_output: \"TOKENS_ONLY\", untokenized_types : [\"aaa\",\"bbb\"]}", - classOf[TokenizerBuilder[WikipediaTokenizer]]).asInstanceOf[TokenizerBuilder[WikipediaTokenizer]] - assert(classOf[WikipediaTokenizerBuilder].getCanonicalName == abstractBuilder.getClass.getCanonicalName) - val tokenizer = abstractBuilder.buildTokenizer - assert(classOf[WikipediaTokenizer].getCanonicalName == tokenizer.getClass.getCanonicalName) - assert(TokenOutputEnum.TOKENS_ONLY.toString == abstractBuilder.asInstanceOf[WikipediaTokenizerBuilder].tokenOutput) - assert(Array("aaa", "bbb").apply(1) == abstractBuilder.asInstanceOf[WikipediaTokenizerBuilder].untokenizedTypes.apply(1)) - } - - test("WikipediaTokenizerBuilder parse JSON throws NoSuchElementException") { - val abstractBuilder = buildAbstractBuilder("{type: 
\"wikipedia\", token_output: \"OKENS_ONLY\", untokenized_types : [\"aaa\",\"bbb\"]}", - classOf[TokenizerBuilder[WikipediaTokenizer]]).asInstanceOf[TokenizerBuilder[WikipediaTokenizer]] - assert(classOf[WikipediaTokenizerBuilder].getCanonicalName == abstractBuilder.getClass.getCanonicalName) - assertThrows[NoSuchElementException]{abstractBuilder.buildTokenizer} + test("available tokenizer"){ + assert(TokenizerFactory.availableTokenizers().size() == 13) + } + + type T = TokenizerBuilder[TokenizerFactory] + + def assertBuild(tokenizerBuilder: T, factoryClass: String, tokenizerClass: String) = { + val tokenizerFactory = tokenizerBuilder.build + val tokenizer = tokenizerFactory.create() + assert(tokenizerFactory.getClass.getSimpleName == factoryClass) + assert(tokenizer.getClass.getSimpleName == tokenizerClass) + } + + def assertBuildException (tokenizerBuilder: T) = assertThrows[RuntimeException](tokenizerBuilder.build.create()) + + test("ClassicTokenizerBuilder") { + val jsonTest1 = """{"type":"classic", "max_token_length": 25}""" + val jsonTest2 = """{"type":"classic"}""" + val jsonTest3 = """{"type":"classic", "max_token_length": -25}""" + val factoryName = "ClassicTokenizerFactory" + val tokenizerName = "ClassicTokenizer" + assertBuild(JsonSerializer.fromString(jsonTest1, classOf[ClassicTokenizerBuilder]), factoryName, tokenizerName) + assertBuild(JsonSerializer.fromString(jsonTest2, classOf[ClassicTokenizerBuilder]), factoryName, tokenizerName) + assertBuildException(JsonSerializer.fromString(jsonTest3, classOf[ClassicTokenizerBuilder])) + } + + test("EdgeNGramTokenizerBuilder") { + val jsonTest1 = """{"type":"edge_ngram", "max_gram_size": 25, "min_gram_size": 10}""" + val jsonTest2 = """{"type":"edge_ngram"}""" + val jsonTest3 = """{"type":"edge_ngram", "max_gram_size": 25, "min_gram_size": -10}""" + val factoryName = "EdgeNGramTokenizerFactory" + val tokenizerName = "EdgeNGramTokenizer" + assertBuild(JsonSerializer.fromString(jsonTest1, classOf[EdgeNGramTokenizerBuilder]), factoryName, tokenizerName) + assertBuild(JsonSerializer.fromString(jsonTest2, classOf[EdgeNGramTokenizerBuilder]), factoryName, tokenizerName) + assertBuildException(JsonSerializer.fromString(jsonTest3, classOf[EdgeNGramTokenizerBuilder])) + } + + test("KeywordTokenizerBuilder") { + val jsonTest1 = """{"type":"keyword"}""" + val factoryName = "KeywordTokenizerFactory" + val tokenizerName = "KeywordTokenizer" + assertBuild(JsonSerializer.fromString(jsonTest1, classOf[KeywordTokenizerBuilder]), factoryName, tokenizerName) + } + + test("LetterTokenizerBuilder") { + val jsonTest1 = """{"type":"letter"}""" + val factoryName = "LetterTokenizerFactory" + val tokenizerName = "LetterTokenizer" + assertBuild(JsonSerializer.fromString(jsonTest1, classOf[LetterTokenizerBuilder]), factoryName, tokenizerName) + } + + test("LowerCaseTokenizerBuilder") { + val jsonTest1 = """{"type":"lower_case"}""" + val factoryName = "LowerCaseTokenizerFactory" + val tokenizerName = "LowerCaseTokenizer" + assertBuild(JsonSerializer.fromString(jsonTest1, classOf[LowerCaseTokenizerBuilder]), factoryName, tokenizerName) + } + + test("NGramTokenizerBuilder") { + val jsonTest1 = """{"type":"ngram", "max_gram_size": 25, "min_gram_size": 10}""" + val jsonTest2 = """{"type":"ngram"}""" + val jsonTest3 = """{"type":"ngram", "max_gram_size": 25, "min_gram_size": -10}""" + val factoryName = "NGramTokenizerFactory" + val tokenizerName = "NGramTokenizer" + assertBuild(JsonSerializer.fromString(jsonTest1, classOf[NGramTokenizerBuilder]), factoryName, 
tokenizerName) + assertBuild(JsonSerializer.fromString(jsonTest2, classOf[NGramTokenizerBuilder]), factoryName, tokenizerName) + assertBuildException(JsonSerializer.fromString(jsonTest3, classOf[NGramTokenizerBuilder])) + } + + test("PathHierarchyTokenizerBuilder") { + val jsonTest1 = """{"type":"path_hierarchy", "reverse": false, "delimiter": "/", "replace": "%", skip: 3}""" + val jsonTest2 = """{"type":"path_hierarchy"}""" + val jsonTest3 = """{"type":"path_hierarchy", "reverse": false, "delimiter": "/", "replace": "%", skip: -3}""" + val factoryName = "PathHierarchyTokenizerFactory" + val tokenizerName = "PathHierarchyTokenizer" + assertBuild(JsonSerializer.fromString(jsonTest1, classOf[PathHierarchyTokenizerBuilder]), factoryName, tokenizerName) + assertBuild(JsonSerializer.fromString(jsonTest2, classOf[PathHierarchyTokenizerBuilder]), factoryName, tokenizerName) + assertBuildException(JsonSerializer.fromString(jsonTest3, classOf[PathHierarchyTokenizerBuilder])) + + val jsonTest4 = """{"type":"path_hierarchy", "reverse": true, "delimiter": "/", "replace": "%", skip: 3}""" + val jsonTest5 = """{"type":"path_hierarchy", "reverse": true, "delimiter": "/", "replace": "%", skip: -3}""" + val tokenizerNameReverse = "ReversePathHierarchyTokenizer" + assertBuild(JsonSerializer.fromString(jsonTest4, classOf[PathHierarchyTokenizerBuilder]), factoryName, tokenizerNameReverse) + assertBuildException(JsonSerializer.fromString(jsonTest5, classOf[PathHierarchyTokenizerBuilder])) + } + + test("PatternTokenizerBuilder") { + val jsonTest1 = """{"type":"pattern", pattern: "[a-z]", group: 0}""" + val jsonTest2 = """{"type":"pattern", pattern: "[a-z]", group: 0}""" + val jsonTest3 = """{"type":"pattern"}""" + val factoryName = "PatternTokenizerFactory" + val tokenizerName = "PatternTokenizer" + assertBuild(JsonSerializer.fromString(jsonTest1, classOf[PatternTokenizerBuilder]), factoryName, tokenizerName) + assertBuild(JsonSerializer.fromString(jsonTest2, classOf[PatternTokenizerBuilder]), factoryName, tokenizerName) + assertBuildException(JsonSerializer.fromString(jsonTest3, classOf[PatternTokenizerBuilder])) + } + + test("StandardTokenizerBuilder") { + val jsonTest1 = """{"type":"standard", "max_token_length": 25}""" + val jsonTest2 = """{"type":"standard"}""" + val jsonTest3 = """{"type":"standard", "max_token_length": -25}""" + val factoryName = "StandardTokenizerFactory" + val tokenizerName = "StandardTokenizer" + assertBuild(JsonSerializer.fromString(jsonTest1, classOf[StandardTokenizerBuilder]), factoryName, tokenizerName) + assertBuild(JsonSerializer.fromString(jsonTest2, classOf[StandardTokenizerBuilder]), factoryName, tokenizerName) + assertBuildException(JsonSerializer.fromString(jsonTest3, classOf[StandardTokenizerBuilder])) + } + + test("ThaiTokenizerBuilder") { + val jsonTest1 = """{"type":"thai"}""" + val factoryName = "ThaiTokenizerFactory" + val tokenizerName = "ThaiTokenizer" + assertBuild(JsonSerializer.fromString(jsonTest1, classOf[ThaiTokenizerBuilder]), factoryName, tokenizerName) + } + + test("UAX29URLEmailTokenizerBuilder") { + val jsonTest1 = """{"type":"uax29_url_email", "max_token_length": 25}""" + val jsonTest2 = """{"type":"uax29_url_email"}""" + val jsonTest3 = """{"type":"uax29_url_email", "max_token_length": -25}""" + val factoryName = "UAX29URLEmailTokenizerFactory" + val tokenizerName = "UAX29URLEmailTokenizer" + assertBuild(JsonSerializer.fromString(jsonTest1, classOf[UAX29URLEmailTokenizerBuilder]), factoryName, tokenizerName) + 
assertBuild(JsonSerializer.fromString(jsonTest2, classOf[UAX29URLEmailTokenizerBuilder]), factoryName, tokenizerName)
+    assertBuildException(JsonSerializer.fromString(jsonTest3, classOf[UAX29URLEmailTokenizerBuilder]))
+  }
+
+  test("WhiteSpaceTokenizerBuilder") {
+    val jsonTest1 = """{"type":"whitespace", "rule": "java"}"""
+    val jsonTest2 = """{"type":"whitespace"}"""
+    val jsonTest3 = """{"type":"whitespace", "rule": "failure"}"""
+    val factoryName = "WhitespaceTokenizerFactory"
+    val tokenizerName = "WhitespaceTokenizer"
+    assertBuild(JsonSerializer.fromString(jsonTest1, classOf[WhitespaceTokenizerBuilder]), factoryName, tokenizerName)
+    assertBuild(JsonSerializer.fromString(jsonTest2, classOf[WhitespaceTokenizerBuilder]), factoryName, tokenizerName)
+    assertBuildException(JsonSerializer.fromString(jsonTest3, classOf[WhitespaceTokenizerBuilder]))
+
+    val jsonTest4 = """{"type":"whitespace", "rule": "unicode"}"""
+    val tokenizerNameUnicode = "UnicodeWhitespaceTokenizer"
+    assertBuild(JsonSerializer.fromString(jsonTest4, classOf[WhitespaceTokenizerBuilder]), factoryName, tokenizerNameUnicode)
+  }
+
+  test("WikipediaTokenizerBuilder") {
+    val jsonTest1 = """{"type":"wikipedia"}"""
+    val factoryName = "WikipediaTokenizerFactory"
+    val tokenizerName = "WikipediaTokenizer"
+    assertBuild(JsonSerializer.fromString(jsonTest1, classOf[WikipediaTokenizerBuilder]), factoryName, tokenizerName)
   }
 }

From c98e3fc5e83fc0d21029da4bc28fd9ce959cfdc8 Mon Sep 17 00:00:00 2001
From: jpgilaberte
Date: Fri, 2 Jun 2017 12:15:55 +0200
Subject: [PATCH 15/40] Add testsAT CustomAnalyzer test

---
 .../analysis/tokenizer/CustomAnalyzerIT.java  | 63 +++++++++++++++++++
 .../tokenizer/TokenizerBuilderIT.java         | 28 ++++++---
 2 files changed, 84 insertions(+), 7 deletions(-)
 create mode 100644 testsAT/src/test/java/com/stratio/cassandra/lucene/testsAT/schema/analysis/tokenizer/CustomAnalyzerIT.java

diff --git a/testsAT/src/test/java/com/stratio/cassandra/lucene/testsAT/schema/analysis/tokenizer/CustomAnalyzerIT.java b/testsAT/src/test/java/com/stratio/cassandra/lucene/testsAT/schema/analysis/tokenizer/CustomAnalyzerIT.java
new file mode 100644
index 000000000..9c8958bc1
--- /dev/null
+++ b/testsAT/src/test/java/com/stratio/cassandra/lucene/testsAT/schema/analysis/tokenizer/CustomAnalyzerIT.java
@@ -0,0 +1,63 @@
+/*
+ * Copyright (C) 2014 Stratio (http://stratio.com)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.stratio.cassandra.lucene.testsAT.schema.analysis.tokenizer;
+
+import com.stratio.cassandra.lucene.builder.index.schema.analysis.tokenizer.NGramTokenizer;
+import com.stratio.cassandra.lucene.testsAT.BaseIT;
+import com.stratio.cassandra.lucene.testsAT.util.CassandraUtils;
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+
+import static com.stratio.cassandra.lucene.builder.Builder.*;
+import static com.stratio.cassandra.lucene.builder.Builder.match;
+
+/**
+ * Created by jpgilaberte on 2/06/17.
+ */ +@RunWith(JUnit4.class) +public class CustomAnalyzerIT extends BaseIT { + private static CassandraUtils utils; + + @BeforeClass + public static void before() {} + + @AfterClass + public static void after() { + CassandraUtils.dropKeyspaceIfNotNull(utils); + } + + @Test + public void testCustomAnalyzer() { + utils = CassandraUtils.builder("tokenizer") + .withPartitionKey("pk") + .withColumn("pk", "int") + .withColumn("rc", "text", textMapper().analyzer("en")) + .withAnalyzer("en", customAnalyzer(new NGramTokenizer(2,2))) + .build() + .createKeyspace() + .createTable() + .insert("pk,rc", 1, "aabb") + .createIndex().refresh() + .filter(all()).check(1) + .filter(none()).check(0) + .filter(match("rc", "aa")).check(1) + .filter(match("rc", "ab")).check(1) + .filter(match("rc", "bb")).check(1); + } +} diff --git a/testsAT/src/test/java/com/stratio/cassandra/lucene/testsAT/schema/analysis/tokenizer/TokenizerBuilderIT.java b/testsAT/src/test/java/com/stratio/cassandra/lucene/testsAT/schema/analysis/tokenizer/TokenizerBuilderIT.java index d3ce7352f..a16516b45 100644 --- a/testsAT/src/test/java/com/stratio/cassandra/lucene/testsAT/schema/analysis/tokenizer/TokenizerBuilderIT.java +++ b/testsAT/src/test/java/com/stratio/cassandra/lucene/testsAT/schema/analysis/tokenizer/TokenizerBuilderIT.java @@ -44,6 +44,25 @@ public static void after() { CassandraUtils.dropKeyspaceIfNotNull(utils); } +// @Test +// public void testClassicTokenizer() { +// utils = CassandraUtils.builder("tokenizer") +// .withPartitionKey("pk") +// .withColumn("pk", "int") +// .withColumn("rc", "text", textMapper().analyzer("en")) +// .withAnalyzer("en", customAnalyzer(new NGramTokenizer(2,1), +// null, +// null)) +// .build() +// .createKeyspace() +// .createTable() +// .insert("pk,rc", 1, "aabb") +// .createIndex().refresh() +// .filter(all()).check(1) +// .filter(none()).check(0) +// .filter(match("rc", "aa")).check(1) +// .filter(match("rc", "ab")).check(1); +// } @Test public void testClassicTokenizer() { utils = CassandraUtils.builder("tokenizer") @@ -218,7 +237,6 @@ public void testLetterTokenizer() { .filter(phrase("rc", "The 2 QUICK Brown-Foxes jumped the")).check(1) .filter(fuzzy("rc", "The 2 QUICK Brown-Foxes jumped the lazy dog bone. and/o")).check(0) .filter(fuzzy("rc", "The 2 QUICK Brown-Foxes jumped the lazy dog bone.")).check(0) - //TODO: check this behaviour .filter(contains("rc", "The 2 QUICK Brown-Foxes")).check(1) .filter(contains("rc", "jump")).check(0) .filter(contains("rc", "and/or")).check(1) @@ -249,7 +267,6 @@ public void testLowerCaseTokenizer() { .filter(phrase("rc", "The 2 QUICK Brown-Foxes jumped the")).check(1) .filter(fuzzy("rc", "The 2 QUICK Brown-Foxes jumped the lazy dog bone. 
and/o")).check(0) .filter(fuzzy("rc", "The 2 QUICK Brown-Foxes jumped the lazy dog bone.")).check(0) - //TODO: check this behaviour .filter(contains("rc", "The 2 QUICK Brown-Foxes")).check(1) .filter(contains("rc", "jump")).check(0) .filter(contains("rc", "and/or")).check(1) @@ -385,7 +402,7 @@ public void testReversePathHierarchyTokenizer() { .withPartitionKey("pk") .withColumn("pk", "int") .withColumn("rc", "text", textMapper().analyzer("en")) - .withAnalyzer("en", customAnalyzer(new ReversePathHierarchyTokenizer())) + .withAnalyzer("en", customAnalyzer(new PathHierarchyTokenizer(true, '/', '/', 0))) .build() .createKeyspace() .createTable() @@ -551,7 +568,7 @@ public void testUnicodeWhiteSpaceTokenizerTokenizer() { .withPartitionKey("pk") .withColumn("pk", "int") .withColumn("rc", "text", textMapper().analyzer("en")) - .withAnalyzer("en", customAnalyzer(new UnicodeWhitespaceTokenizer())) + .withAnalyzer("en", customAnalyzer(new WhitespaceTokenizer("unicode"))) .build() .createKeyspace() .createTable() @@ -590,7 +607,6 @@ public void testUnicodeWhiteSpaceTokenizerTokenizer() { .filter(fuzzy("rc", "gjumperd")).check(1) .filter(fuzzy("rc", "dogjumperdog")).check(0) .filter(contains("rc", "jumped")).check(1) - //TODO: check this behaviour .filter(contains("rc", "jump")).check(0) .filter(contains("rc", "jumper")).check(0) .filter(contains("rc", "ajumped")).check(0) @@ -653,7 +669,6 @@ public void testWhiteSpaceTokenizerTokenizer() { .filter(fuzzy("rc", "gjumperd")).check(1) .filter(fuzzy("rc", "dogjumperdog")).check(0) .filter(contains("rc", "jumped")).check(1) - //TODO: check this behaviour .filter(contains("rc", "jump")).check(0) .filter(contains("rc", "jumper")).check(0) .filter(contains("rc", "ajumped")).check(0) @@ -707,4 +722,3 @@ public void testWikipediaTokenizerTokenizer() { .filter(contains("rc", "sub head followed by some text")).check(1); } } - From 3461943a21268b412209575905744d64b30dbf25 Mon Sep 17 00:00:00 2001 From: jpgilaberte Date: Wed, 7 Jun 2017 13:22:35 +0200 Subject: [PATCH 16/40] Add JavaDoc in builder --- .../cassandra/lucene/builder/Builder.java | 16 ++-- .../index/schema/analysis/CustomAnalyzer.java | 8 +- .../analysis/tokenizer/KeywordTokenizer.java | 3 + .../analysis/tokenizer/NGramTokenizer.java | 4 +- .../tokenizer/PathHierarchyTokenizer.java | 4 +- .../analysis/tokenizer/PatternTokenizer.java | 2 - .../schema/analysis/tokenizer/Tokenizer.java | 3 + .../tokenizer/WhitespaceTokenizer.java | 5 +- .../tokenizer/WikipediaTokenizer.java | 76 +------------------ plugin/pom.xml | 5 -- .../analysis/tokenizer/CustomAnalyzerIT.java | 4 +- .../tokenizer/TokenizerBuilderIT.java | 25 +----- 12 files changed, 38 insertions(+), 117 deletions(-) diff --git a/builder/src/main/java/com/stratio/cassandra/lucene/builder/Builder.java b/builder/src/main/java/com/stratio/cassandra/lucene/builder/Builder.java index fe0f905ca..c99de5e86 100644 --- a/builder/src/main/java/com/stratio/cassandra/lucene/builder/Builder.java +++ b/builder/src/main/java/com/stratio/cassandra/lucene/builder/Builder.java @@ -258,15 +258,15 @@ public static SnowballAnalyzer snowballAnalyzer(String language) { } /** - * Returns a new {@link CustomAnalyzer} for the specified language and stopwords. + * Returns a new {@link CustomAnalyzer} using custom tokenizer, char_filters and token_filters. * - * @param tokenizer - * @param charFilter - * @param tokenFiliter + * @param tokenizer an {@link Tokenizer} the tokenizer to use. + * @param charFilter an {@link CharFilter[]} the charFilter array to use. 
+ * @param tokenFilter the array of {@link TokenFilter}s to use.
- * @return
+ * @return the new {@link CustomAnalyzer}
 */
- public static CustomAnalyzer customAnalyzer(Tokenizer tokenizer, CharFilter[] charFilter, TokenFilter[] tokenFiliter) {
-    return new CustomAnalyzer(tokenizer, charFilter, tokenFiliter);
+ public static CustomAnalyzer customAnalyzer(Tokenizer tokenizer, CharFilter[] charFilter, TokenFilter[] tokenFilter) {
+    return new CustomAnalyzer(tokenizer, charFilter, tokenFilter);
 }

 public static CustomAnalyzer customAnalyzer(Tokenizer tokenizer) {
@@ -277,8 +277,8 @@ public static CustomAnalyzer customAnalyzer(Tokenizer tokenizer, CharFilter[] ch
 return new CustomAnalyzer(tokenizer, charFilter, null);
 }

- public static CustomAnalyzer customAnalyzer(Tokenizer tokenizer, TokenFilter[] tokenFiliter) {
-    return new CustomAnalyzer(tokenizer, null, tokenFiliter);
+ public static CustomAnalyzer customAnalyzer(Tokenizer tokenizer, TokenFilter[] tokenFilter) {
+    return new CustomAnalyzer(tokenizer, null, tokenFilter);
 }

 /**
diff --git a/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/CustomAnalyzer.java b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/CustomAnalyzer.java
index 55840a988..c49e42a73 100644
--- a/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/CustomAnalyzer.java
+++ b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/CustomAnalyzer.java
@@ -31,18 +31,24 @@
 */
 public class CustomAnalyzer extends Analyzer{

+ /** The {@code Analyzer} full qualified class name. */
 @JsonProperty("token_filter")
 private final TokenFilter[] tokenFilter;

+ /** The {@code Analyzer} full qualified class name. */
 @JsonProperty("char_filter")
 private final CharFilter[] charFilter;

+ /** The {@code Analyzer} full qualified class name. */
 @JsonProperty("tokenizer")
 private final Tokenizer tokenizer;

+
 /**
 * Builds a new {@link CustomAnalyzer} using custom tokenizer, char_filters and token_filters.
 *
- * @param tokenizer an {@link Tokenizer} the tookenizer to use.
+ * @param tokenizer the {@link Tokenizer} to use.
+ * @param charFilter the array of {@link CharFilter}s to use.
+ * @param tokenFilter the array of {@link TokenFilter}s to use.
 */
 @JsonCreator
 public CustomAnalyzer(@JsonProperty("tokenizer") Tokenizer tokenizer, @JsonProperty("char_filter") CharFilter[] charFilter,
diff --git a/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/KeywordTokenizer.java b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/KeywordTokenizer.java
index 111d69d10..3120ae480 100644
--- a/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/KeywordTokenizer.java
+++ b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/KeywordTokenizer.java
@@ -27,6 +27,9 @@
 */
 public class KeywordTokenizer extends Tokenizer {

+ /**
+ * Builds a new {@link KeywordTokenizer}.
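+ *
+ * <p>This tokenizer emits the entire input as a single token, so a field can be
+ * indexed verbatim with, e.g. (illustrative sketch): <pre>{@code
+ * customAnalyzer(new KeywordTokenizer())
+ * }</pre>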
+ */
 @JsonCreator
 public KeywordTokenizer() {
 }

diff --git a/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/NGramTokenizer.java b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/NGramTokenizer.java
index ebcf11875..cb4597bea 100644
--- a/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/NGramTokenizer.java
+++ b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/NGramTokenizer.java
@@ -51,8 +51,8 @@ public NGramTokenizer() {
 /**
- * Builds a new {@link NGramTokenizer} using the specified minGramSize and manGram.
+ * Builds a new {@link NGramTokenizer} using the specified minGramSize and maxGramSize.
 *
- * @param gramMinSize the smallest n-gram to generate
- * @param gramMaxSize the largest n-gram to generate
+ * @param minGramSize the smallest n-gram to generate
+ * @param maxGramSize the largest n-gram to generate
 */
 @JsonCreator
 public NGramTokenizer(@JsonProperty("min_gram_size") Integer minGramSize,
diff --git a/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/PathHierarchyTokenizer.java b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/PathHierarchyTokenizer.java
index 90c45198b..f221aa8a7 100644
--- a/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/PathHierarchyTokenizer.java
+++ b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/PathHierarchyTokenizer.java
@@ -70,11 +70,11 @@ public PathHierarchyTokenizer() {
 @JsonCreator
 public PathHierarchyTokenizer(@JsonProperty("reverse") Boolean reverse,
 @JsonProperty("delimiter") Character delimiter,
- @JsonProperty("replace") Character replacement,
+ @JsonProperty("replace") Character replace,
 @JsonProperty("skip") Integer skip) {
 this.reverse = getOrDefault(reverse, REVERSE);
 this.delimiter = getOrDefault(delimiter, DEFAULT_DELIMITER);
- this.replace = getOrDefault(replacement, DEFAULT_REPLACEMENT);
+ this.replace = getOrDefault(replace, DEFAULT_REPLACEMENT);
 this.skip = getOrDefault(skip, DEFAULT_SKIP);
 }
 }
diff --git a/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/PatternTokenizer.java b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/PatternTokenizer.java
index fbec32c8d..acae88333 100644
--- a/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/PatternTokenizer.java
+++ b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/PatternTokenizer.java
@@ -51,12 +51,10 @@ public PatternTokenizer() {
- * Builds a new {@link PatternTokenizer} using the specified pattern, flags, and group.
+ * Builds a new {@link PatternTokenizer} using the specified pattern and group.
 *
 * @param pattern java regular expression
- * @param flags java regular expression flags
 * @param group a pattern group to use to generate tokens (-1 for split)
 */
 @JsonCreator
 public PatternTokenizer(@JsonProperty("pattern") String pattern,
- @JsonProperty("flags") Integer flags,
 @JsonProperty("group") Integer group) {
 this.pattern = getOrDefault(pattern, DEFAULT_PATTERN);
 this.group = getOrDefault(group, DEFAULT_GROUP);
diff --git a/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/Tokenizer.java b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/Tokenizer.java
index 2a4e9e1c7..050bc32f1 100644
--- a/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/Tokenizer.java
+++ b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/Tokenizer.java
@@ -39,7 +39,10 @@
 @JsonSubTypes.Type(value = WhitespaceTokenizer.class, name = "whitespace"),
 @JsonSubTypes.Type(value = WikipediaTokenizer.class, name = "wikipedia")})
 public abstract class Tokenizer extends JSONBuilder {

+ /**
+ * Returns the main parameter if it is not null, or the default one otherwise.
+ *
 * @param param the main parameter.
- * @param defaultParam the default parameter if main paramaeter is null.
- * @param return type must extend {@link Tokenizer}
+ * @param defaultParam the default parameter, used if the main parameter is null.
+ * @param <T> the return type, which must extend {@link Tokenizer}
diff --git a/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/WhitespaceTokenizer.java b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/WhitespaceTokenizer.java
index 35fe57c0d..5f20ce3a6 100644
--- a/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/WhitespaceTokenizer.java
+++ b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/WhitespaceTokenizer.java
@@ -29,7 +29,7 @@
 public class WhitespaceTokenizer extends Tokenizer {

 private final String RULE = "java";

- /** terms cache read buffer size */
+ /** The whitespace rule, either {@code java} or {@code unicode}. */
 @JsonProperty("rule")
 final String rule;

@@ -41,6 +41,9 @@
 public WhitespaceTokenizer() {
 this.rule = RULE;
 }

+ /**
+ * Builds a new {@link WhitespaceTokenizer} with the specified rule.
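+ *
+ * e.g. (as used in the acceptance tests): {@code customAnalyzer(new WhitespaceTokenizer("unicode"))}
+ *
+ * @param rule the whitespace rule, either {@code java} or {@code unicode}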
+ */ @JsonCreator public WhitespaceTokenizer(String rule) { this.rule = rule; diff --git a/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/WikipediaTokenizer.java b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/WikipediaTokenizer.java index 828462c80..cd6b560cf 100644 --- a/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/WikipediaTokenizer.java +++ b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/tokenizer/WikipediaTokenizer.java @@ -30,80 +30,10 @@ */ public class WikipediaTokenizer extends Tokenizer { + /** + * Builds a new {@link WikipediaTokenizer} + */ @JsonCreator public WikipediaTokenizer() {} -// TODO: refactor wikipedia factory with advanced parameters -// static final TokenOutputValue DEFAULT_TOKEN_OUTPUT = TokenOutputValue.TOKENS_ONLY; -// static final Set DEFAULT_UNTOKENIZED_TYPES = Collections.emptySet(); -// -// static final int TOKENS_ONLY_VAR = 0; -// static final int UNTOKENIZED_ONLY_VAR = 1; -// static final int BOTH_VAR = 2; -// -// public enum TokenOutputValue { -// -// TOKENS_ONLY("TOKENS_ONLY", TOKENS_ONLY_VAR), -// UNTOKENIZED_ONLY("UNTOKENIZED_ONLY", UNTOKENIZED_ONLY_VAR), -// BOTH("BOTH", BOTH_VAR); -// -// private int integerValue; -// private String stringValue; -// -// TokenOutputValue(String name, int value) { -// this.stringValue = name; -// this.integerValue = value; -// } -// -// @JsonCreator -// public static TokenOutputValue create(String value) { -// if (value == null) { -// throw new IllegalArgumentException(); -// } -// for (TokenOutputValue v : values()) { -// if (v.getStringValue().equals(value)) { -// return v; -// } -// } -// throw new IllegalArgumentException(); -// } -// -// public int getIntegerValue() { -// return integerValue; -// } -// -// public String getStringValue() { -// return stringValue; -// } -// } -// -// /** this tokenizer output, only untokenized, only tokens or both */ -// @JsonProperty("token_output") -// final TokenOutputValue tokenOutput; -// /** //TODO */ -// @JsonProperty("untokenized_types") -// final Set untokenizedTypes; -// -// /** -// * Builds a new {@link WikipediaTokenizer} using the default tokenOutput and untokenizedTypes. -// * -// */ -// @JsonCreator -// public WikipediaTokenizer() { -// this.tokenOutput = DEFAULT_TOKEN_OUTPUT; -// this.untokenizedTypes = DEFAULT_UNTOKENIZED_TYPES; -// } -// -// /** -// * Builds a new {@link WikipediaTokenizer} using the specified tokenOutput and untokenizedTypes. 
-// * -// * @param tokenOutput this tokenizer output, only untokenized, only tokens or both -// * @param untokenizedTypes //TODO -// */ -// @JsonCreator -// public WikipediaTokenizer(@JsonProperty("token_output") WikipediaTokenizer.TokenOutputValue tokenOutput, -// @JsonProperty("untokenized_types") Set untokenizedTypes) { -// this.tokenOutput = getOrDefault(tokenOutput, DEFAULT_TOKEN_OUTPUT); -// this.untokenizedTypes = getOrDefault(untokenizedTypes, DEFAULT_UNTOKENIZED_TYPES); -// } } \ No newline at end of file diff --git a/plugin/pom.xml b/plugin/pom.xml index 95603a461..92caebfdb 100644 --- a/plugin/pom.xml +++ b/plugin/pom.xml @@ -97,11 +97,6 @@ jackson-databind ${jackson.version} - - com.fasterxml.jackson.module - jackson-module-scala_2.12 - 2.8.8 - org.mockito mockito-all diff --git a/testsAT/src/test/java/com/stratio/cassandra/lucene/testsAT/schema/analysis/tokenizer/CustomAnalyzerIT.java b/testsAT/src/test/java/com/stratio/cassandra/lucene/testsAT/schema/analysis/tokenizer/CustomAnalyzerIT.java index 9c8958bc1..464eef4bd 100644 --- a/testsAT/src/test/java/com/stratio/cassandra/lucene/testsAT/schema/analysis/tokenizer/CustomAnalyzerIT.java +++ b/testsAT/src/test/java/com/stratio/cassandra/lucene/testsAT/schema/analysis/tokenizer/CustomAnalyzerIT.java @@ -28,7 +28,9 @@ import static com.stratio.cassandra.lucene.builder.Builder.match; /** - * Created by jpgilaberte on 2/06/17. + * Test Custom Analyzer. + * + * @author Juan Pedro Gilaberte {@literal } */ @RunWith(JUnit4.class) public class CustomAnalyzerIT extends BaseIT { diff --git a/testsAT/src/test/java/com/stratio/cassandra/lucene/testsAT/schema/analysis/tokenizer/TokenizerBuilderIT.java b/testsAT/src/test/java/com/stratio/cassandra/lucene/testsAT/schema/analysis/tokenizer/TokenizerBuilderIT.java index a16516b45..a45c3b659 100644 --- a/testsAT/src/test/java/com/stratio/cassandra/lucene/testsAT/schema/analysis/tokenizer/TokenizerBuilderIT.java +++ b/testsAT/src/test/java/com/stratio/cassandra/lucene/testsAT/schema/analysis/tokenizer/TokenizerBuilderIT.java @@ -27,9 +27,9 @@ import static com.stratio.cassandra.lucene.builder.Builder.*; /** - * Test partitioning on partition key column. + * Test Tokenizers. 
* - * @author Andres de la Pena {@literal } + * @author Juan Pedro Gilaberte {@literal } */ @RunWith(JUnit4.class) public class TokenizerBuilderIT extends BaseIT{ @@ -44,25 +44,6 @@ public static void after() { CassandraUtils.dropKeyspaceIfNotNull(utils); } -// @Test -// public void testClassicTokenizer() { -// utils = CassandraUtils.builder("tokenizer") -// .withPartitionKey("pk") -// .withColumn("pk", "int") -// .withColumn("rc", "text", textMapper().analyzer("en")) -// .withAnalyzer("en", customAnalyzer(new NGramTokenizer(2,1), -// null, -// null)) -// .build() -// .createKeyspace() -// .createTable() -// .insert("pk,rc", 1, "aabb") -// .createIndex().refresh() -// .filter(all()).check(1) -// .filter(none()).check(0) -// .filter(match("rc", "aa")).check(1) -// .filter(match("rc", "ab")).check(1); -// } @Test public void testClassicTokenizer() { utils = CassandraUtils.builder("tokenizer") @@ -365,7 +346,7 @@ public void testPatternTokenizer() { .withPartitionKey("pk") .withColumn("pk", "int") .withColumn("rc", "text", textMapper().analyzer("en")) - .withAnalyzer("en", customAnalyzer(new PatternTokenizer("/", 0, -1))) + .withAnalyzer("en", customAnalyzer(new PatternTokenizer("/", -1))) .build() .createKeyspace() .createTable() From 598f3144d4ca75ec5f87aa024bc28caf5cbb28b8 Mon Sep 17 00:00:00 2001 From: jpgilaberte Date: Thu, 8 Jun 2017 13:00:01 +0200 Subject: [PATCH 17/40] Add ScalaDoc in plugin --- .../index/schema/analysis/CustomAnalyzer.java | 6 ++--- .../lucene/schema/analysis/Builder.scala | 27 +++++++++++-------- .../analyzer/CustomAnalyzerBuilder.scala | 11 ++++++-- .../charFilter/CharFilterBuilder.scala | 7 ++++- .../tokenFilter/TokenFilterBuilder.scala | 7 ++++- .../analysis/tokenizer/TokenizerBuilder.scala | 8 ++++++ 6 files changed, 48 insertions(+), 18 deletions(-) diff --git a/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/CustomAnalyzer.java b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/CustomAnalyzer.java index c49e42a73..5e5e400ea 100644 --- a/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/CustomAnalyzer.java +++ b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/CustomAnalyzer.java @@ -31,15 +31,15 @@ */ public class CustomAnalyzer extends Analyzer{ - /** The {@code Analyzer} full qualified class name. */ + /** The {@code TokenFilter} array. */ @JsonProperty("token_filter") private final TokenFilter[] tokenFilter; - /** The {@code Analyzer} full qualified class name. */ + /** The {@code CharFilter} array. */ @JsonProperty("char_filter") private final CharFilter[] charFilter; - /** The {@code Analyzer} full qualified class name. */ + /** The {@code Tokenizer} instance. 
 */
    @JsonProperty("tokenizer")
    private final Tokenizer tokenizer;
diff --git a/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/Builder.scala b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/Builder.scala
index af6ef7a0a..bb5a52eb7 100644
--- a/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/Builder.scala
+++ b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/Builder.scala
@@ -20,30 +20,31 @@ import collection.JavaConverters._
 import scala.reflect.runtime.{universe=>ru}
 
 /**
+  * Provides the machinery that lets a case class extending this trait use reflection to build a
+  * HashMap of its attributes, as required to instantiate Lucene factories.
+  *
   * @author Juan Pedro Gilaberte jpgilaberte@stratio.com
   */
 trait Builder[T] {
 
   /**
-    * Implement function to return Lucene object
+    * Function returning the built Lucene factory. Concrete builders must override it.
     *
-    * @return
+    * @return the built factory
     */
   def buildFunction : () => T
 
-  //TODO: refactor scala style (remove throw and manage exception in centralized layer)
   /**
-    * Auxiliary function to manage Java Exceptions (Lucene's layer)
+    * Helper function that rethrows the Java exceptions raised in Lucene's layer.
     *
     * @param throwable
-    * @return
     */
   def failThrowException(throwable: Throwable) = throw throwable
 
   /**
-    * Manage Java Exceptions (Lucene's layer)
+    * Wraps the Java control flow and exceptions of Lucene's layer in Scala style.
     *
-    * @return the built analyzer
+    * @return the built factory
     */
   def build: T = {
     import scala.util.control.Exception._
@@ -60,23 +61,27 @@ trait Builder[T] {
   def getOrDefault(param: Option[Any], defaultParam: Any): Any = param.map(x => x).getOrElse(defaultParam)
 
   /**
+    * Helper function returning the {@link TermSymbol}s of the current instance.
     *
-    * @return
+    * @return an iterable of terms (reflection API)
     */
   def termSymbolsList = scala.reflect.runtime.currentMirror.classSymbol(this.getClass).toType
     .members.collect { case m: ru.TermSymbol if m.isGetter => m }.map(_.asTerm)
 
   /**
+    * Helper function returning the value of a {@link TermSymbol}.
     *
-    * @param termString
-    * @return
+    * @param termString the TermSymbol to evaluate
+    * @return the value of the TermSymbol
     */
   def reflectedFieldValue(termString: ru.TermSymbol) = ru.runtimeMirror(this.getClass.getClassLoader)
     .reflect(this).reflectField(termString).get
 
   /**
+    * Converts the child instance parameters into a Java {HashMap[String, String]}.
+    * It is usually called from the 'buildFunction' method overridden in the child classes.
     *
-    * @return
+    * @return a Java {HashMap[String, String]} mapping each parameter name to its value
     */
   def mapParsed = new util.HashMap[String, String](termSymbolsList.collect({case tm: ru.TermSymbol if reflectedFieldValue(tm) != null => tm})
     .map(x => (x.name.toString, reflectedFieldValue(x).toString)).toMap.asJava)
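To make the reflection contract above concrete, here is a minimal sketch of a builder written
against this trait. The case class shape and its field names are illustrative assumptions (the
real tokenizer builders live in the tokenizer package), while TokenizerFactory.forName(name, args)
is the Lucene lookup the trait's subclasses delegate to:

    import org.apache.lucene.analysis.util.TokenizerFactory
    import com.stratio.cassandra.lucene.schema.analysis.Builder

    // Hypothetical concrete builder: the non-null case-class fields become the
    // keys and values of the argument map that mapParsed hands to Lucene.
    final case class NGramTokenizerBuilder(minGramSize: Integer, maxGramSize: Integer)
      extends Builder[TokenizerFactory] {
      override def buildFunction = () => TokenizerFactory.forName("ngram", mapParsed)
    }

    // NGramTokenizerBuilder(2, 3).build
    //   => TokenizerFactory.forName("ngram", {minGramSize=2, maxGramSize=3})

Note that mapParsed stringifies every value, so the Lucene factories themselves are responsible
for parsing numeric parameters back from their string form.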
diff --git a/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/analyzer/CustomAnalyzerBuilder.scala b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/analyzer/CustomAnalyzerBuilder.scala
index f55f36e46..90a442be9 100644
--- a/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/analyzer/CustomAnalyzerBuilder.scala
+++ b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/analyzer/CustomAnalyzerBuilder.scala
@@ -28,15 +28,22 @@ import org.apache.lucene.analysis.util.{CharFilterFactory, TokenFilterFactory, T
 import scala.util.{Failure, Success, Try}
 
-
 /**
-  * Created by jpgilaberte on 24/05/17.
+  * {@link AnalyzerBuilder} for building custom {@link Analyzer}s from a tokenizer plus optional
+  * arrays of char filters and token filters.
+  *
+  * @author Juan Pedro Gilaberte jpgilaberte@stratio.com
+  *
+  * @param tokenizer the tokenizer to use
+  * @param charFilter the char filter array to use
+  * @param tokenFilter the token filter array to use
   */
 final case class CustomAnalyzerBuilder(
                     @JsonProperty("tokenizer") tokenizer: TokenizerBuilder[_],
                     @JsonProperty("char_filter") charFilter: Array[CharFilterBuilder[_]],
                     @JsonProperty("token_filter") tokenFilter: Array[TokenFilterBuilder[_]]
                   ) extends AnalyzerBuilder{
+
+  /** {@inheritDoc} */
   override def analyzer(): Analyzer = {
     val custom = CustomAnalyzer.builder()
diff --git a/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/charFilter/CharFilterBuilder.scala b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/charFilter/CharFilterBuilder.scala
index 4e6f3e9e8..cd806fa20 100644
--- a/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/charFilter/CharFilterBuilder.scala
+++ b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/charFilter/CharFilterBuilder.scala
@@ -22,7 +22,12 @@ import org.apache.lucene.analysis.util.CharFilterFactory
 
 /**
-  * Base utility class for implementing a {@link BaseCharFilter}.
+  * {@link Builder} for building Lucene {@link CharFilterFactory}s.
+  *
+  * Encapsulates all the functionality needed to build a Lucene CharFilter: concrete subclasses
+  * override 'buildFunction', declared in the Builder trait, to construct a specific Lucene
+  * CharFilterFactory from its factory name and parameters.
+  *
+  * @param typeBuilder the name of the factory in the Lucene API
   *
   * @author Juan Pedro Gilaberte jpgilaberte@stratio.com
   */
diff --git a/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenFilter/TokenFilterBuilder.scala b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenFilter/TokenFilterBuilder.scala
index edab7c71c..79629b060 100644
--- a/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenFilter/TokenFilterBuilder.scala
+++ b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenFilter/TokenFilterBuilder.scala
@@ -22,7 +22,12 @@ import org.apache.lucene.analysis.util.TokenFilterFactory
 
 /**
-  * Base utility class for implementing a {@link BaseCharFilter}.
+  * {@link Builder} for building Lucene {@link TokenFilterFactory}s.
+  *
+  * Encapsulates all the functionality needed to build a Lucene TokenFilter: concrete subclasses
+  * override 'buildFunction', declared in the Builder trait, to construct a specific Lucene
+  * TokenFilterFactory from its factory name and parameters.
+  *
+  * @param typeBuilder the name of the factory in the Lucene API
   *
   * @author Juan Pedro Gilaberte jpgilaberte@stratio.com
   */
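These sealed hierarchies are registered with Jackson through @JsonTypeInfo/@JsonSubTypes, so the
"type" field of a JSON object selects the concrete builder. A minimal round-trip sketch, assuming
an ObjectMapper configured to bind the annotated case-class constructors as the plugin's JSON
layer does:

    import com.fasterxml.jackson.databind.ObjectMapper
    import com.stratio.cassandra.lucene.schema.analysis.tokenFilter.TokenFilterBuilder

    // Hypothetical round trip: "type" picks the subclass through the
    // @JsonSubTypes registry and the remaining fields fill its constructor.
    val mapper = new ObjectMapper() // assumed configured like the plugin's own mapper
    val json = """{"type": "shingle", "min_shingle_size": 2, "max_shingle_size": 3}"""
    val builder = mapper.readValue(json, classOf[TokenFilterBuilder[_]])

    // build() ends up calling TokenFilterFactory.forName("shingle", mapParsed)
    val factory = builder.build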
diff --git a/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/TokenizerBuilder.scala b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/TokenizerBuilder.scala
index f652458ac..739c7fd02 100644
--- a/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/TokenizerBuilder.scala
+++ b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenizer/TokenizerBuilder.scala
@@ -22,6 +22,13 @@ import org.apache.lucene.analysis.util.TokenizerFactory
 
 /**
+  * {@link Builder} for building Lucene {@link TokenizerFactory}s.
+  *
+  * Encapsulates all the functionality needed to build a Lucene Tokenizer: concrete subclasses
+  * override 'buildFunction', declared in the Builder trait, to construct a specific Lucene
+  * TokenizerFactory from its factory name and parameters.
+  *
+  * @param typeBuilder the name of the factory in the Lucene API
+  *
   * @author Juan Pedro Gilaberte jpgilaberte@stratio.com
   */
 @JsonTypeInfo(use = JsonTypeInfo.Id.NAME, include = JsonTypeInfo.As.PROPERTY, property = "type")
@@ -39,6 +46,7 @@ import org.apache.lucene.analysis.util.TokenizerFactory
     new Type(value = classOf[WhitespaceTokenizerBuilder], name = "whitespace"),
     new Type(value = classOf[WikipediaTokenizerBuilder], name = "wikipedia"))
 ) sealed abstract class TokenizerBuilder[T](typeBuilder: String) extends Builder[T]{
+  /** {@inheritDoc} */
   def buildFunction = () => TokenizerFactory.forName(typeBuilder, mapParsed).asInstanceOf[T]
 }

From a1d5f0fa66e8c8892c2ae05811085c494c592e51 Mon Sep 17 00:00:00 2001
From: jpgilaberte
Date: Mon, 12 Jun 2017 11:57:01 +0200
Subject: [PATCH 18/40] Add TokenFilter documentation

---
 doc/documentation.rst                    | 185 ++++++++++++++++--
 .../charFilter/CharFilterBuilder.scala   |   4 +-
 .../tokenFilter/TokenFilterBuilder.scala |   9 +-
 3 files changed, 182 insertions(+), 16 deletions(-)

diff --git a/doc/documentation.rst b/doc/documentation.rst
index 009e1d3ad..e271105f7 100644
--- a/doc/documentation.rst
+++ b/doc/documentation.rst
@@ -18,6 +18,7 @@ Stratio's Cassandra Lucene Index
 - `Virtual node partitioner <#virtual-node-partitioner>`__
 - `Analyzers <#analyzers>`__
     - `Classpath analyzer <#classpath-analyzer>`__
+    - `Custom analyzer <#custom-analyzer>`__
     - `Snowball analyzer <#snowball-analyzer>`__
 - `Mappers <#mappers>`__
     - `Big decimal mapper <#big-decimal-mapper>`__
@@ -770,15 +771,21 @@ Analyzers

Analyzer definition options depend on the analyzer type. Details and
default values are listed in the table below.

+-----------------+--------------+--------------+-----------------+
| Analyzer type   | Option       | Value type   | Default value   |
+=================+==============+==============+=================+
| classpath       | class        | string       | null            |
+-----------------+--------------+--------------+-----------------+
| custom          | tokenizer    | Tokenizer    | -               |
|                 +--------------+--------------+-----------------+
|                 | charFilter[] | CharFilter[] | null            |
|                 +--------------+--------------+-----------------+
|                 | tokenFilter[]| TokenFilter[]| null            |
+-----------------+--------------+--------------+-----------------+
| snowball        | language     | string       | null            |
|                 +--------------+--------------+-----------------+
|                 | stopwords    | string       | null            |
+-----------------+--------------+--------------+-----------------+

The analyzers defined in this section can be referenced by mappers. Additionally, there are prebuilt analyzers for:

Classpath analyzer
__________________

Analyzer which instantiates a Lucene `analyzer `__
present in the classpath.
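Since this analyzer type only needs a class name, its behaviour can be pictured with a one-line
sketch: resolve the class and call its default constructor. This is only an illustration; the
actual logic lives in the plugin's ClasspathAnalyzerBuilder, and the CQL example below shows the
equivalent declarative form:

    import org.apache.lucene.analysis.Analyzer

    // Minimal sketch: load an Analyzer from the classpath by its fully
    // qualified class name and instantiate it with its no-argument constructor.
    def classpathAnalyzer(className: String): Analyzer =
      Class.forName(className).getConstructor().newInstance().asInstanceOf[Analyzer]

    // classpathAnalyzer("org.apache.lucene.analysis.en.EnglishAnalyzer")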
**Example:** @@ -890,7 +897,7 @@ Snowball analyzer _________________ Analyzer using a `http://snowball.tartarus.org/ `__ snowball filter -`SnowballFilter `__ +`SnowballFilter `__ Example: ~~~~~~~~ @@ -914,6 +921,162 @@ Example: Supported languages: English, French, Spanish, Portuguese, Italian, Romanian, German, Dutch, Swedish, Norwegian, Danish, Russian, Finnish, Hungarian and Turkish. +Custom analyzer +_______________ +Analyzer which instances a Lucene's `analyzer `__ +present in classpath. It is a general-purpose Analyzer that can have zero or more CharFilters, at least one Tokenizer and zero or more TokenFilter. +Under the hood it uses the Lucene's factory classes TokenizerFactory, TokenFilterFactory, and CharFilterFactory. + +**Example:** + +.. code-block:: sql + + CREATE CUSTOM INDEX census_index on census() + USING 'com.stratio.cassandra.lucene.Index' + WITH OPTIONS = { + 'refresh_seconds': '1', + 'schema': '{ + analyzers: { + an_analyzer: { + type: "custom", + tokenizer: {type:"whitespace"}, + token_filter: [{type:"asciifolding"}, {type:"lowercase"}] + } + } + }' + }; + +CharFilter +"""""""""" + ++-----------------------------+--------------+--------------+----------------+------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| CharFilter name | Option | Value type | Default value | Mandatory | CharFilter full package name | ++=============================+==============+==============+================+============+==========================================================================================================================================================================================+ +| pattern | pattern | string | null | No | `org.apache.lucene.analysis.pattern.PatternReplaceCharFilter `__ | ++-----------------------------+--------------+--------------+----------------+------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| htmlstrip | escapedtags | string[] | null | No | `org.apache.lucene.analysis.charfilter.HTMLStripCharFilter `__ | ++-----------------------------+--------------+--------------+----------------+------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| mapping | mapping | string | null | No | `org.apache.lucene.analysis.charfilter.MappingCharFilter `__ | ++-----------------------------+--------------+--------------+----------------+------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| persian | | | | No | `org.apache.lucene.analysis.fa.PersianCharFilter `__ | ++-----------------------------+--------------+--------------+----------------+------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ + +Tokenizer +""""""""" 
++-----------------------------+-------------------+--------------+----------------+------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| CharFilter name | Option | Value type | Default value | Mandatory | CharFilter full package name | ++=============================+==============+==============+================+============+===============================================================================================================================================================================================+ +| classic | max_token_length | integer | 256 | No | `org.apache.lucene.analysis.standard.ClassicTokenizer `__ | ++-----------------------------+-------------------+--------------+----------------+------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| edge_ngram | min_gram_size | integer | 1 | No | `org.apache.lucene.analysis.ngram.EdgeNGramTokenizer `__ | +| +-------------------+--------------+----------------+------------+ | +| | max_gram_size | integer | 2 | No | | ++-----------------------------+-------------------+--------------+----------------+------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| keyword | | | | | `org.apache.lucene.analysis.core.KeywordTokenizer `__ | ++-----------------------------+-------------------+--------------+----------------+------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| letter | | | | | `org.apache.lucene.analysis.core.LetterTokenizer `__ | ++-----------------------------+-------------------+--------------+----------------+------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| lower_case | | | | | `org.apache.lucene.analysis.core.LowerCaseTokenizer `__ | ++-----------------------------+-------------------+--------------+----------------+------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| ngram | min_gram_size | integer | 1 | No | `org.apache.lucene.analysis.ngram.NGramTokenizer `__ | +| +-------------------+--------------+----------------+------------+ | +| | max_gram_size | integer | 2 | No | | ++-----------------------------+-------------------+--------------+----------------+------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| path_hierarchy | reverse | boolean | false | No | `org.apache.lucene.analysis.path.PathHierarchyTokenizer `__ | +| +-------------------+--------------+----------------+------------+ | +| | delimiter | char | / | No | | +| +-------------------+--------------+----------------+------------+ | +| | replace | char | / | No | | +| 
+-------------------+--------------+----------------+------------+ | +| | skip | integer | 0 | No | | ++-----------------------------+-------------------+--------------+----------------+------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| pattern | pattern | string | null | No | `org.apache.lucene.analysis.pattern.PatternTokenizer `__ | +| +-------------------+--------------+----------------+------------+ | +| | group | integer | -1 | No | | ++-----------------------------+-------------------+--------------+----------------+------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| standard | max_token_length | integer | 256 | No | `org.apache.lucene.analysis.standard.StandardTokenizer `__ | ++-----------------------------+-------------------+--------------+----------------+------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| uax29_url_email | max_token_length | integer | 256 | No | `org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer `__ | ++-----------------------------+-------------------+--------------+----------------+------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| thai | | | | | `org.apache.lucene.analysis.th.ThaiTokenizer `__ | ++-----------------------------+-------------------+--------------+----------------+------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| whitespace | rule | string | java | No | `org.apache.lucene.analysis.core.WhitespaceTokenizer `__ | ++-----------------------------+-------------------+--------------+----------------+------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| wikipedia | | | | | `org.apache.lucene.analysis.wikipedia.WikipediaTokenizer `__ | ++-----------------------------+-------------------+--------------+----------------+------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ + +TokenFilter +""""""""""" ++-----------------------------+-------------------+--------------+----------------+------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| CharFilter name | Option | Value type | Default value | Mandatory | CharFilter full package name | ++=============================+==============+==============+================+============+=============================================================================================================================================================================================================+ +| standard | | | | 
| `org.apache.lucene.analysis.standard.StandardFilter `__ | ++-----------------------------+-------------------+--------------+----------------+------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| apostrophe | | | | | `org.apache.lucene.analysis.tr.ApostropheFilter `__ | ++-----------------------------+-------------------+--------------+----------------+------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| arabicnormalization | | | | | `org.apache.lucene.analysis.ar.ArabicNormalizationFilter `__ | ++-----------------------------+-------------------+--------------+----------------+------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| arabicstem | | | | | `org.apache.lucene.analysis.ar.ArabicStemFilter `__ | ++-----------------------------+-------------------+--------------+----------------+------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| indicnormalization | | | | | `org.apache.lucene.analysis.in.IndicNormalizationFilter `__ | ++-----------------------------+-------------------+--------------+----------------+------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| portuguesestem | | | | | `org.apache.lucene.analysis.pt.PortugueseStemFilter `__ | ++-----------------------------+-------------------+--------------+----------------+------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| germanminimalstem | | | | | `org/apache/lucene/analysis/de/GermanMinimalStemFilter `__ | ++-----------------------------+-------------------+--------------+----------------+------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| uppercase | | | | | `org.apache.lucene.analysis.core.UpperCaseFilter `__ | ++-----------------------------+-------------------+--------------+----------------+------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| keywordrepeat | | | | | `org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilter `__ | ++-----------------------------+-------------------+--------------+----------------+------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| classic | | | | | `org.apache.lucene.analysis.standard.ClassicFilter `__ | 
++-----------------------------+-------------------+--------------+----------------+------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| shingle | min_shingle_size | integer | 2 | No | `org.apache.lucene.analysis.shingle.ShingleFilter `__ | +| +-------------------+--------------+----------------+------------+ | +| | max_shingle_size | integer | 2 | No | | ++-----------------------------+-------------------+--------------+----------------+------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| stemmeroverride | dictionary | string | | No | `org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter `__ | +| +-------------------+--------------+----------------+------------+ | +| | ignore_case | boolean | false | No | | ++-----------------------------+-------------------+--------------+----------------+------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| bulgarianstem | | | | | `org.apache.lucene.analysis.bg.BulgarianStemFilter `__ | ++-----------------------------+-------------------+--------------+----------------+------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| swedishlightstem | | | | | `org.apache.lucene.analysis.sv.SwedishLightStemFilter `__ | ++-----------------------------+-------------------+--------------+----------------+------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| frenchlightstem | | | | | `org.apache.lucene.analysis.fr.FrenchLightStemFilter `__ | ++-----------------------------+-------------------+--------------+----------------+------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| cjkwidth | | | | | `org.apache.lucene.analysis.cjk.CJKWidthFilter `__ | ++-----------------------------+-------------------+--------------+----------------+------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| greekstem | | | | | `org.apache.lucene.analysis.el.GreekStemFilter `__ | ++-----------------------------+-------------------+--------------+----------------+------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| stop | words | string | | No | `org.apache.lucene.analysis.core.StopFilter `__ | +| +-------------------+--------------+----------------+------------+ | +| | format | string | | No | | +| +-------------------+--------------+----------------+------------+ | +| | ignore_case | boolean | 
false | No | | ++-----------------------------+-------------------+--------------+----------------+------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| hindistem | | | | | `org.apache.lucene.analysis.hi.HindiStemFilter `__ | ++-----------------------------+-------------------+--------------+----------------+------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| fingerprint | maxOutputTokenSize| integer | 1024 | No | `org.apache.lucene.analysis.miscellaneous.FingerprintFilterFactory `__ | +| +-------------------+--------------+----------------+------------+ | +| | separator | char | " " | No | | ++-----------------------------+-------------------+--------------+----------------+------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| spanishlightstem | | | | | `org.apache.lucene.analysis.es.SpanishLightStemFilterFactory `__ | ++-----------------------------+-------------------+--------------+----------------+------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| hungarianlightstem | | | | | `org.apache.lucene.analysis.hu.HungarianLightStemFilterFactory `__ | ++-----------------------------+-------------------+--------------+----------------+------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| norwegianminimalstem | | | | | `org.apache.lucene.analysis.no.NorwegianMinimalStemFilterFactory `__ | ++-----------------------------+-------------------+--------------+----------------+------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| persiannormalization | | | | | `org.apache.lucene.analysis.fa.PersianNormalizationFilterFactory `__ | ++-----------------------------+-------------------+--------------+----------------+------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| germanlightstem | | | | | `org.apache.lucene.analysis.de.GermanLightStemFilterFactory `__ | ++-----------------------------+-------------------+--------------+----------------+------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| type | | | | | `org.apache.lucene.analysis.core.TypeTokenFilterFactory `__ | 
++-----------------------------+-------------------+--------------+----------------+------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| asciifolding | preserveOriginal | boolean | false | No | `org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilterFactory `__ | ++-----------------------------+-------------------+--------------+----------------+------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| lowercase | | | | | `org.apache.lucene.analysis.el.GreekLowerCaseFilterFactory `__ | ++-----------------------------+-------------------+--------------+----------------+------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ + Mappers ======= diff --git a/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/charFilter/CharFilterBuilder.scala b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/charFilter/CharFilterBuilder.scala index cd806fa20..ad8565a00 100644 --- a/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/charFilter/CharFilterBuilder.scala +++ b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/charFilter/CharFilterBuilder.scala @@ -37,10 +37,11 @@ import org.apache.lucene.analysis.util.CharFilterFactory new Type(value = classOf[PersianCharFilterBuilder], name = CharFilterBuilder.PERSIAN), new Type(value = classOf[PatternReplaceCharFilterBuilder], name = CharFilterBuilder.PATTERN_REPLACE))) sealed abstract class CharFilterBuilder[T](typeBuilder: String) extends Builder[T]{ + /** {@inheritDoc} */ def buildFunction = () => CharFilterFactory.forName(typeBuilder, mapParsed).asInstanceOf[T] } -final case class HtmlStripCharFilterBuilder() extends CharFilterBuilder[CharFilterFactory](CharFilterBuilder.HTML_STRIP) +final case class HtmlStripCharFilterBuilder(@JsonProperty(CharFilterBuilder.ESCAPED_TAGS) escapedTags: Array[String]) extends CharFilterBuilder[CharFilterFactory](CharFilterBuilder.HTML_STRIP) final case class PersianCharFilterBuilder() extends CharFilterBuilder[CharFilterFactory](CharFilterBuilder.PERSIAN) final case class PatternReplaceCharFilterBuilder(@JsonProperty(CharFilterBuilder.PATTERN) pattern: String, @JsonProperty(CharFilterBuilder.REPLACEMENT) replacement:String) extends CharFilterBuilder[CharFilterFactory](CharFilterBuilder.PATTERN_REPLACE) final case class MappingCharFilterBuilder(@JsonProperty(CharFilterBuilder.MAPPINGS) mapping: String) extends CharFilterBuilder[CharFilterFactory](CharFilterBuilder.MAPPING){ @@ -48,6 +49,7 @@ final case class MappingCharFilterBuilder(@JsonProperty(CharFilterBuilder.MAPPIN } object CharFilterBuilder{ + final val ESCAPED_TAGS = "escapedtags" final val MAPPINGS = "mapping" final val TYPE = "type" final val PATTERN = "pattern" diff --git a/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenFilter/TokenFilterBuilder.scala b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenFilter/TokenFilterBuilder.scala index 79629b060..7ee256d0b 100644 --- a/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenFilter/TokenFilterBuilder.scala 
+++ b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenFilter/TokenFilterBuilder.scala
@@ -63,6 +63,7 @@ import org.apache.lucene.analysis.util.TokenFilterFactory
     new Type(value = classOf[LowercaseTokenFilter], name = "lowercase")
 )) sealed abstract class TokenFilterBuilder[T](typeBuilder: String) extends Builder[T]{
+  /** {@inheritDoc} */
   def buildFunction = () => TokenFilterFactory.forName(typeBuilder, mapParsed).asInstanceOf[T]
 }
@@ -77,15 +78,15 @@ final case class GermanMinimalStemTokenFilterBuilder() extends TokenFilterBuilde
 final case class UpperCaseTokenFilterBuilder() extends TokenFilterBuilder[TokenFilterFactory]("uppercase")
 final case class KeywordRepeatTokenFilterBuilder() extends TokenFilterBuilder[TokenFilterFactory]("keywordrepeat")
 final case class ClassicTokenFilterBuilder() extends TokenFilterBuilder[TokenFilterFactory]("classic")
-final case class ShingleTokenFilterBuilder() extends TokenFilterBuilder[TokenFilterFactory]("shingle")
-final case class StemmeroverrideTokenFilterBuilder() extends TokenFilterBuilder[TokenFilterFactory]("stemmeroverride")
+final case class ShingleTokenFilterBuilder(@JsonProperty("min_shingle_size") minShingleSize: Integer, @JsonProperty("max_shingle_size") maxShingleSize: Integer) extends TokenFilterBuilder[TokenFilterFactory]("shingle")
+final case class StemmeroverrideTokenFilterBuilder(@JsonProperty("dictionary") dictionary: String, @JsonProperty("ignore_case") ignoreCase: Boolean) extends TokenFilterBuilder[TokenFilterFactory]("stemmeroverride")
 final case class BulgarianstemTokenFilterBuilder() extends TokenFilterBuilder[TokenFilterFactory]("bulgarianstem")
 final case class SwedishlightstemTokenFilterBuilder() extends TokenFilterBuilder[TokenFilterFactory]("swedishlightstem")
 final case class FrenchlightstemTokenFilterBuilder() extends TokenFilterBuilder[TokenFilterFactory]("frenchlightstem")
 final case class CjkwidthTokenFilterBuilder() extends TokenFilterBuilder[TokenFilterFactory]("cjkwidth")
 final case class GreekstemTokenFilterBuilder() extends TokenFilterBuilder[TokenFilterFactory]("greekstem")
 final case class StopTokenFilterBuilder() extends TokenFilterBuilder[TokenFilterFactory]("stop")
 final case class HindistemTokenFilterBuilder() extends TokenFilterBuilder[TokenFilterFactory]("hindistem")
-final case class FingerprintTokenFilterBuilder() extends TokenFilterBuilder[TokenFilterFactory]("fingerprint")
+final case class FingerprintTokenFilterBuilder(@JsonProperty("maxOutputTokenSize") maxOutputTokenSize: Integer, @JsonProperty("separator") separator: String) extends TokenFilterBuilder[TokenFilterFactory]("fingerprint")
 final case class SpanishlightstemTokenFilterBuilder() extends TokenFilterBuilder[TokenFilterFactory]("spanishlightstem")
 final case class HungarianlightstemTokenFilterBuilder() extends TokenFilterBuilder[TokenFilterFactory]("hungarianlightstem")
@@ -94,7 +95,7 @@ final case class PersiannormalizationTokenFilterBuilder() extends TokenFilterBui
 final case class GermanlightstemTokenFilterBuilder() extends TokenFilterBuilder[TokenFilterFactory]("germanlightstem")
 final case class TypeTokenFilterBuilder() extends TokenFilterBuilder[TokenFilterFactory]("type")
 //
-final case class AsciifoldingTokenFilter(@JsonProperty("preserveOriginal") preserveOriginal:Boolean) extends TokenFilterBuilder[TokenFilterFactory]("asciifolding")
+final case class AsciifoldingTokenFilter(@JsonProperty("preserve_original") preserveOriginal:Boolean) extends TokenFilterBuilder[TokenFilterFactory]("asciifolding")
 final case
class LowercaseTokenFilter() extends TokenFilterBuilder[TokenFilterFactory]("lowercase") // //final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("germanstem") From 3532cab20b32b7867f44e8a323743954c68b62cc Mon Sep 17 00:00:00 2001 From: jpgilaberte Date: Mon, 12 Jun 2017 12:28:13 +0200 Subject: [PATCH 19/40] Fix RST format --- doc/documentation.rst | 67 ++++++++++++++++++++++--------------------- 1 file changed, 35 insertions(+), 32 deletions(-) diff --git a/doc/documentation.rst b/doc/documentation.rst index e271105f7..af85aa84f 100644 --- a/doc/documentation.rst +++ b/doc/documentation.rst @@ -19,6 +19,9 @@ Stratio's Cassandra Lucene Index - `Analyzers <#analyzers>`__ - `Classpath analyzer <#classpath-analyzer>`__ - `Custom analyzer <#custom-analyzer>`__ + - `CharFilter <#charfilter>`__ + - `Tokenizer <#tokenizer>`__ + - `TokenFilter <#tokenfilter>`__ - `Snowball analyzer <#snowball-analyzer>`__ - `Mappers <#mappers>`__ - `Big decimal mapper <#big-decimal-mapper>`__ @@ -107,11 +110,11 @@ based implementation of Cassandra secondary indexes, where each node of the clus Cassandra indexes are one of the core modules on which `Stratio’s BigData platform `__ is based. .. image:: /doc/resources/architecture.png - :width: 100% +:width: 100% :alt: architecture - :align: center + :align: center -Index `relevance searches `__ allow you to retrieve the + Index `relevance searches `__ allow you to retrieve the *n* more relevant results satisfying a search. The coordinator node sends the search to each node in the cluster, each node returns its *n* best results and then the coordinator combines these partial results and gives you the *n* best of them, avoiding full scan. You can also base the sorting in a combination of fields. @@ -124,18 +127,18 @@ frameworks as `Apache Hadoop `__ or, even better, `Ap Adding Lucene filters in the jobs input can dramatically reduce the amount of data to be processed, avoiding full scan. .. image:: /doc/resources/spark_architecture.png - :width: 100% +:width: 100% :alt: spark_architecture - :align: center + :align: center -This project is not intended to replace Apache Cassandra denormalized tables, inverted indexes, and/or secondary -indexes. It is just a tool to perform some kind of queries which are really hard to be addressed using Apache Cassandra -out of the box features, filling the gap between real-time and analytics. + This project is not intended to replace Apache Cassandra denormalized tables, inverted indexes, and/or secondary + indexes. It is just a tool to perform some kind of queries which are really hard to be addressed using Apache Cassandra + out of the box features, filling the gap between real-time and analytics. .. image:: /doc/resources/oltp_olap.png - :width: 100% +:width: 100% :alt: oltp_olap - :align: center + :align: center Features ======== @@ -776,11 +779,11 @@ default values are listed in the table below. 
+=================+==============+==============+=================+ | classpath | class | string | null | +-----------------+--------------+--------------+-----------------+ -| custom | tokenizer | Tokenizer | - | +| custom | tokenizer | Tokenizer | | | +--------------+--------------+-----------------+ -| | charFilter[] | CharFilter[] | null | +| | charFilter | CharFilter | null | | +--------------+--------------+-----------------+ -| | tokenFilter[]| TokenFilter[]| null | +| | tokenFilter | TokenFilter | null | +-----------------+--------------+--------------+-----------------+ | snowball | language | string | null | | +--------------+--------------+-----------------+ @@ -947,7 +950,7 @@ Under the hood it uses the Lucene's factory classes TokenizerFactory, TokenFilte }; CharFilter -"""""""""" +========== +-----------------------------+--------------+--------------+----------------+------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | CharFilter name | Option | Value type | Default value | Mandatory | CharFilter full package name | @@ -962,7 +965,7 @@ CharFilter +-----------------------------+--------------+--------------+----------------+------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ Tokenizer -""""""""" +========= +-----------------------------+-------------------+--------------+----------------+------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | CharFilter name | Option | Value type | Default value | Mandatory | CharFilter full package name | +=============================+==============+==============+================+============+===============================================================================================================================================================================================+ @@ -1006,7 +1009,7 @@ Tokenizer +-----------------------------+-------------------+--------------+----------------+------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ TokenFilter -""""""""""" +=========== +-----------------------------+-------------------+--------------+----------------+------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | CharFilter name | Option | Value type | Default value | Mandatory | CharFilter full package name | +=============================+==============+==============+================+============+=============================================================================================================================================================================================================+ @@ -1685,9 +1688,9 @@ into your Cassandra installation lib directory. **Example 2:** Index only the centroid of the WKT shape contained in the indexed column: .. 
image:: /doc/resources/geo_shape_mapper_example_2.png - :width: 100% +:width: 100% :alt: search by shape - :align: center + :align: center .. code-block:: sql @@ -1719,9 +1722,9 @@ into your Cassandra installation lib directory. **Example 3:** Index a buffer 50 kilometres around the area of a city: .. image:: /doc/resources/geo_shape_mapper_example_3.png - :width: 100% +:width: 100% :alt: search by shape - :align: center + :align: center .. code-block:: sql @@ -1753,9 +1756,9 @@ into your Cassandra installation lib directory. **Example 4:** Index a buffer 50 kilometres around the borders of a country: .. image:: /doc/resources/geo_shape_mapper_example_4.png - :width: 100% +:width: 100% :alt: search by shape - :align: center + :align: center .. code-block:: sql @@ -1786,9 +1789,9 @@ into your Cassandra installation lib directory. **Example 5:** Index the convex hull of the WKT shape contained in the indexed column: .. image:: /doc/resources/geo_shape_mapper_example_5.png - :width: 100% +:width: 100% :alt: search by shape - :align: center + :align: center .. code-block:: sql @@ -1817,9 +1820,9 @@ into your Cassandra installation lib directory. **Example 6:** Index the bounding box of the WKT shape contained in the indexed column: .. image:: /doc/resources/geo_shape_mapper_example_6.png - :width: 100% +:width: 100% :alt: search by shape - :align: center + :align: center .. code-block:: sql @@ -3330,9 +3333,9 @@ where: **Example 1:** search for shapes within a polygon: .. image:: /doc/resources/geo_shape_condition_example_1.png - :width: 100% +:width: 100% :alt: search by shape - :align: center + :align: center .. code-block:: sql @@ -3362,9 +3365,9 @@ Using the `Java query builder <#query-builder>`__: Florida's coastline: .. image:: /doc/resources/geo_shape_condition_example_2.png - :width: 100% +:width: 100% :alt: buffer transformation - :align: center + :align: center .. code-block:: sql @@ -4772,9 +4775,9 @@ approaches depends on the particular use case. Generally, combining Lucene index retrieving no more than the 25% of the stored data. .. 
image:: /doc/resources/spark_performance.png - :width: 100% +:width: 100% :alt: spark_performance - :align: center + :align: center ------------- JMX Interface From af386ef783aae3cac830625d9dc4848bdbcc5e80 Mon Sep 17 00:00:00 2001 From: jpgilaberte Date: Mon, 12 Jun 2017 12:33:12 +0200 Subject: [PATCH 20/40] Fix RST format --- doc/documentation.rst | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/doc/documentation.rst b/doc/documentation.rst index af85aa84f..284724015 100644 --- a/doc/documentation.rst +++ b/doc/documentation.rst @@ -950,7 +950,7 @@ Under the hood it uses the Lucene's factory classes TokenizerFactory, TokenFilte }; CharFilter -========== +~~~~~~~~~~ +-----------------------------+--------------+--------------+----------------+------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | CharFilter name | Option | Value type | Default value | Mandatory | CharFilter full package name | @@ -965,10 +965,10 @@ CharFilter +-----------------------------+--------------+--------------+----------------+------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ Tokenizer -========= +~~~~~~~~~ +-----------------------------+-------------------+--------------+----------------+------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | CharFilter name | Option | Value type | Default value | Mandatory | CharFilter full package name | -+=============================+==============+==============+================+============+===============================================================================================================================================================================================+ ++=============================+===================+==============+================+============+==========================================================================================================================================================================================+ | classic | max_token_length | integer | 256 | No | `org.apache.lucene.analysis.standard.ClassicTokenizer `__ | +-----------------------------+-------------------+--------------+----------------+------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | edge_ngram | min_gram_size | integer | 1 | No | `org.apache.lucene.analysis.ngram.EdgeNGramTokenizer `__ | @@ -1009,10 +1009,10 @@ Tokenizer +-----------------------------+-------------------+--------------+----------------+------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ TokenFilter -=========== +~~~~~~~~~~~ +-----------------------------+-------------------+--------------+----------------+------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | CharFilter 
name | Option | Value type | Default value | Mandatory | CharFilter full package name | -+=============================+==============+==============+================+============+=============================================================================================================================================================================================================+ ++=============================+===================+==============+================+============+========================================================================================================================================================================================================+ | standard | | | | | `org.apache.lucene.analysis.standard.StandardFilter `__ | +-----------------------------+-------------------+--------------+----------------+------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | apostrophe | | | | | `org.apache.lucene.analysis.tr.ApostropheFilter `__ | From c63b55371e79b48b8d79ac0ae6c397721f33f4a9 Mon Sep 17 00:00:00 2001 From: jpgilaberte Date: Mon, 12 Jun 2017 12:36:35 +0200 Subject: [PATCH 21/40] Fix RST format --- doc/documentation.rst | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/doc/documentation.rst b/doc/documentation.rst index 284724015..710720514 100644 --- a/doc/documentation.rst +++ b/doc/documentation.rst @@ -781,9 +781,9 @@ default values are listed in the table below. +-----------------+--------------+--------------+-----------------+ | custom | tokenizer | Tokenizer | | | +--------------+--------------+-----------------+ -| | charFilter | CharFilter | null | +| | charFilter | CharFilter[] | null | | +--------------+--------------+-----------------+ -| | tokenFilter | TokenFilter | null | +| | tokenFilter | TokenFilter[]| null | +-----------------+--------------+--------------+-----------------+ | snowball | language | string | null | | +--------------+--------------+-----------------+ @@ -953,7 +953,7 @@ CharFilter ~~~~~~~~~~ +-----------------------------+--------------+--------------+----------------+------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| CharFilter name | Option | Value type | Default value | Mandatory | CharFilter full package name | +| Name | Option | Value type | Default value | Mandatory | CharFilter full package name | +=============================+==============+==============+================+============+==========================================================================================================================================================================================+ | pattern | pattern | string | null | No | `org.apache.lucene.analysis.pattern.PatternReplaceCharFilter `__ | +-----------------------------+--------------+--------------+----------------+------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ @@ -967,7 +967,7 @@ CharFilter Tokenizer ~~~~~~~~~ 
+-----------------------------+-------------------+--------------+----------------+------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| CharFilter name | Option | Value type | Default value | Mandatory | CharFilter full package name | +| Name | Option | Value type | Default value | Mandatory | Tokenizer full package name | +=============================+===================+==============+================+============+==========================================================================================================================================================================================+ | classic | max_token_length | integer | 256 | No | `org.apache.lucene.analysis.standard.ClassicTokenizer `__ | +-----------------------------+-------------------+--------------+----------------+------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ @@ -1011,7 +1011,7 @@ Tokenizer TokenFilter ~~~~~~~~~~~ +-----------------------------+-------------------+--------------+----------------+------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| CharFilter name | Option | Value type | Default value | Mandatory | CharFilter full package name | +| Name | Option | Value type | Default value | Mandatory | TokenFilter full package name | +=============================+===================+==============+================+============+========================================================================================================================================================================================================+ | standard | | | | | `org.apache.lucene.analysis.standard.StandardFilter `__ | +-----------------------------+-------------------+--------------+----------------+------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ From b16b9fcf38e2423a87bc2f4705b01bd44df07f7f Mon Sep 17 00:00:00 2001 From: jpgilaberte Date: Mon, 12 Jun 2017 13:59:23 +0200 Subject: [PATCH 22/40] Fix package name format --- doc/documentation.rst | 96 +++++++++++++++++++++---------------------- 1 file changed, 48 insertions(+), 48 deletions(-) diff --git a/doc/documentation.rst b/doc/documentation.rst index 710720514..730f94478 100644 --- a/doc/documentation.rst +++ b/doc/documentation.rst @@ -953,39 +953,39 @@ CharFilter ~~~~~~~~~~ +-----------------------------+--------------+--------------+----------------+------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| Name | Option | Value type | Default value | Mandatory | CharFilter full package name | +| Name | Option | Value type | Default value | Mandatory | CharFilter class name | 
 +=============================+==============+==============+================+============+====================================================================================================+
-| pattern | pattern | string | null | No | `org.apache.lucene.analysis.pattern.PatternReplaceCharFilter `__ |
+| pattern | pattern | string | null | No | `PatternReplaceCharFilter `__ |
 +-----------------------------+--------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| htmlstrip | escapedtags | string[] | null | No | `org.apache.lucene.analysis.charfilter.HTMLStripCharFilter `__ |
+| htmlstrip | escapedtags | string[] | null | No | `HTMLStripCharFilter `__ |
 +-----------------------------+--------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| mapping | mapping | string | null | No | `org.apache.lucene.analysis.charfilter.MappingCharFilter `__ |
+| mapping | mapping | string | null | No | `MappingCharFilter `__ |
 +-----------------------------+--------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| persian | | | | No | `org.apache.lucene.analysis.fa.PersianCharFilter `__ |
+| persian | | | | No | `PersianCharFilter `__ |
 +-----------------------------+--------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+

 Tokenizer
 ~~~~~~~~~
 +-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| Name | Option | Value type | Default value | Mandatory | Tokenizer full package name |
+| Name | Option | Value type | Default value | Mandatory | Tokenizer class name |
 +=============================+===================+==============+================+============+====================================================================================================+
-| classic | max_token_length | integer | 256 | No | `org.apache.lucene.analysis.standard.ClassicTokenizer `__ |
+| classic | max_token_length | integer | 256 | No | `ClassicTokenizer `__ |
 +-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| edge_ngram | min_gram_size | integer | 1 | No | `org.apache.lucene.analysis.ngram.EdgeNGramTokenizer `__ |
+| edge_ngram | min_gram_size | integer | 1 | No | `EdgeNGramTokenizer `__ |
 | +-------------------+--------------+----------------+------------+ |
 | | max_gram_size | integer | 2 | No | |
 +-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| keyword | | | | | `org.apache.lucene.analysis.core.KeywordTokenizer `__ |
+| keyword | | | | | `KeywordTokenizer `__ |
 +-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| letter | | | | | `org.apache.lucene.analysis.core.LetterTokenizer `__ |
+| letter | | | | | `LetterTokenizer `__ |
 +-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| lower_case | | | | | `org.apache.lucene.analysis.core.LowerCaseTokenizer `__ |
+| lower_case | | | | | `LowerCaseTokenizer `__ |
 +-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| ngram | min_gram_size | integer | 1 | No | `org.apache.lucene.analysis.ngram.NGramTokenizer `__ |
+| ngram | min_gram_size | integer | 1 | No | `NGramTokenizer `__ |
 | +-------------------+--------------+----------------+------------+ |
 | | max_gram_size | integer | 2 | No | |
 +-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| path_hierarchy | reverse | boolean | false | No | `org.apache.lucene.analysis.path.PathHierarchyTokenizer `__ |
+| path_hierarchy | reverse | boolean | false | No | `PathHierarchyTokenizer `__ |
 | +-------------------+--------------+----------------+------------+ |
 | | delimiter | char | / | No | |
 | +-------------------+--------------+----------------+------------+ |
@@ -993,91 +993,91 @@ Tokenizer
 | +-------------------+--------------+----------------+------------+ |
 | | skip | integer | 0 | No | |
 +-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| pattern | pattern | string | null | No | `org.apache.lucene.analysis.pattern.PatternTokenizer `__ |
+| pattern | pattern | string | null | No | `PatternTokenizer `__ |
 | +-------------------+--------------+----------------+------------+ |
 | | group | integer | -1 | No | |
 +-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| standard | max_token_length | integer | 256 | No | `org.apache.lucene.analysis.standard.StandardTokenizer `__ |
+| standard | max_token_length | integer | 256 | No | `StandardTokenizer `__ |
 +-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| uax29_url_email | max_token_length | integer | 256 | No | `org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer `__ |
+| uax29_url_email | max_token_length | integer | 256 | No | `UAX29URLEmailTokenizer `__ |
 +-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| thai | | | | | `org.apache.lucene.analysis.th.ThaiTokenizer `__ |
+| thai | | | | | `ThaiTokenizer `__ |
 +-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| whitespace | rule | string | java | No | `org.apache.lucene.analysis.core.WhitespaceTokenizer `__ |
+| whitespace | rule | string | java | No | `WhitespaceTokenizer `__ |
 +-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| wikipedia | | | | | `org.apache.lucene.analysis.wikipedia.WikipediaTokenizer `__ |
+| wikipedia | | | | | `WikipediaTokenizer `__ |
 +-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+

 TokenFilter
 ~~~~~~~~~~~
 +-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| Name | Option | Value type | Default value | Mandatory | TokenFilter full package name |
+| Name | Option | Value type | Default value | Mandatory | TokenFilter class name |
 +=============================+===================+==============+================+============+====================================================================================================+
-| standard | | | | | `org.apache.lucene.analysis.standard.StandardFilter `__ |
+| standard | | | | | `StandardFilter `__ |
 +-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| apostrophe | | | | | `org.apache.lucene.analysis.tr.ApostropheFilter `__ |
+| apostrophe | | | | | `ApostropheFilter `__ |
 +-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| arabicnormalization | | | | | `org.apache.lucene.analysis.ar.ArabicNormalizationFilter `__ |
+| arabicnormalization | | | | | `ArabicNormalizationFilter `__ |
 +-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| arabicstem | | | | | `org.apache.lucene.analysis.ar.ArabicStemFilter `__ |
+| arabicstem | | | | | `ArabicStemFilter `__ |
 +-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| indicnormalization | | | | | `org.apache.lucene.analysis.in.IndicNormalizationFilter `__ |
+| indicnormalization | | | | | `IndicNormalizationFilter `__ |
 +-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| portuguesestem | | | | | `org.apache.lucene.analysis.pt.PortugueseStemFilter `__ |
+| portuguesestem | | | | | `PortugueseStemFilter `__ |
 +-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| germanminimalstem | | | | | `org/apache/lucene/analysis/de/GermanMinimalStemFilter `__ |
+| germanminimalstem | | | | | `GermanMinimalStemFilter `__ |
 +-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| uppercase | | | | | `org.apache.lucene.analysis.core.UpperCaseFilter `__ |
+| uppercase | | | | | `UpperCaseFilter `__ |
 +-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| keywordrepeat | | | | | `org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilter `__ |
+| keywordrepeat | | | | | `KeywordRepeatFilter `__ |
 +-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| classic | | | | | `org.apache.lucene.analysis.standard.ClassicFilter `__ |
+| classic | | | | | `ClassicFilter `__ |
 +-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| shingle | min_shingle_size | integer | 2 | No | `org.apache.lucene.analysis.shingle.ShingleFilter `__ |
+| shingle | min_shingle_size | integer | 2 | No | `ShingleFilter `__ |
 | +-------------------+--------------+----------------+------------+ |
 | | max_shingle_size | integer | 2 | No | |
 +-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| stemmeroverride | dictionary | string | | No | `org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter `__ |
+| stemmeroverride | dictionary | string | | No | `StemmerOverrideFilter `__ |
 | +-------------------+--------------+----------------+------------+ |
 | | ignore_case | boolean | false | No | |
 +-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| bulgarianstem | | | | | `org.apache.lucene.analysis.bg.BulgarianStemFilter `__ |
+| bulgarianstem | | | | | `BulgarianStemFilter `__ |
 +-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| swedishlightstem | | | | | `org.apache.lucene.analysis.sv.SwedishLightStemFilter `__ |
+| swedishlightstem | | | | | `SwedishLightStemFilter `__ |
 +-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| frenchlightstem | | | | | `org.apache.lucene.analysis.fr.FrenchLightStemFilter `__ |
+| frenchlightstem | | | | | `FrenchLightStemFilter `__ |
 +-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| cjkwidth | | | | | `org.apache.lucene.analysis.cjk.CJKWidthFilter `__ |
+| cjkwidth | | | | | `CJKWidthFilter `__ |
 +-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| greekstem | | | | | `org.apache.lucene.analysis.el.GreekStemFilter `__ |
+| greekstem | | | | | `GreekStemFilter `__ |
 +-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| stop | words | string | | No | `org.apache.lucene.analysis.core.StopFilter `__ |
+| stop | words | string | | No | `StopFilter `__ |
 | +-------------------+--------------+----------------+------------+ |
 | | format | string | | No | |
 | +-------------------+--------------+----------------+------------+ |
 | | ignore_case | boolean | false | No | |
 +-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| hindistem | | | | | `org.apache.lucene.analysis.hi.HindiStemFilter `__ |
+| hindistem | | | | | `HindiStemFilter `__ |
 +-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| fingerprint | maxOutputTokenSize| integer | 1024 | No | `org.apache.lucene.analysis.miscellaneous.FingerprintFilterFactory `__ |
+| fingerprint | maxOutputTokenSize| integer | 1024 | No | `FingerprintFilterFactory `__ |
 | +-------------------+--------------+----------------+------------+ |
 | | separator | char | " " | No | |
 +-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| spanishlightstem | | | | | `org.apache.lucene.analysis.es.SpanishLightStemFilterFactory `__ |
+| spanishlightstem | | | | | `SpanishLightStemFilterFactory `__ |
 +-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| hungarianlightstem | | | | | `org.apache.lucene.analysis.hu.HungarianLightStemFilterFactory `__ |
+| hungarianlightstem | | | | | `HungarianLightStemFilterFactory `__ |
 +-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| norwegianminimalstem | | | | | `org.apache.lucene.analysis.no.NorwegianMinimalStemFilterFactory `__ |
+| norwegianminimalstem | | | | | `NorwegianMinimalStemFilterFactory `__ |
 +-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| persiannormalization | | | | | `org.apache.lucene.analysis.fa.PersianNormalizationFilterFactory `__ |
+| persiannormalization | | | | | `PersianNormalizationFilterFactory `__ |
 +-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| germanlightstem | | | | | `org.apache.lucene.analysis.de.GermanLightStemFilterFactory `__ |
+| germanlightstem | | | | | `GermanLightStemFilterFactory `__ |
 +-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| type | | | | | `org.apache.lucene.analysis.core.TypeTokenFilterFactory `__ |
+| type | | | | | `TypeTokenFilterFactory `__ |
 +-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| asciifolding | preserveOriginal | boolean | false | No | `org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilterFactory `__ |
+| asciifolding | preserveOriginal | boolean | false | No | `ASCIIFoldingFilterFactory `__ |
 +-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| lowercase | | | | | `org.apache.lucene.analysis.el.GreekLowerCaseFilterFactory `__ |
+| lowercase | | | | | `GreekLowerCaseFilterFactory `__ |
 +-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+

 Mappers

From a4cd8f322e03739c8dfef024f5c0453a8779870f Mon Sep 17 00:00:00 2001
From: jpgilaberte
Date: Mon, 12 Jun 2017 14:04:29 +0200
Subject: [PATCH 23/40] Fix package name format

---
 doc/documentation.rst | 96 +++++++++++++++++++++----------------------
 1 file changed, 48 insertions(+), 48 deletions(-)

diff --git a/doc/documentation.rst b/doc/documentation.rst
index 730f94478..ee27f25b0 100644
--- a/doc/documentation.rst
+++ b/doc/documentation.rst
@@ -953,39 +953,39 @@ CharFilter
 ~~~~~~~~~~
 +-----------------------------+--------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| Name | Option | Value type | Default value | Mandatory | CharFilter class name |
+| Name | Option | Value type | Default value | Mandatory | CharFilter class name |
 +=============================+==============+==============+================+============+====================================================================================================+
-| pattern | pattern | string | null | No | `PatternReplaceCharFilter `__ |
+| pattern | pattern | string | null | No | `PatternReplaceCharFilter `__ |
 +-----------------------------+--------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| htmlstrip | escapedtags | string[] | null | No | `HTMLStripCharFilter `__ |
+| htmlstrip | escapedtags | string[] | null | No | `HTMLStripCharFilter `__ |
 +-----------------------------+--------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| mapping | mapping | string | null | No | `MappingCharFilter `__ |
+| mapping | mapping | string | null | No | `MappingCharFilter `__ |
 +-----------------------------+--------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| persian | | | | No | `PersianCharFilter `__ |
+| persian | | | | No | `PersianCharFilter `__ |
 +-----------------------------+--------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+

 Tokenizer
 ~~~~~~~~~
 +-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| Name | Option | Value type | Default value | Mandatory | Tokenizer class name |
+| Name | Option | Value type | Default value | Mandatory | Tokenizer class name |
 +=============================+===================+==============+================+============+====================================================================================================+
-| classic | max_token_length | integer | 256 | No | `ClassicTokenizer `__ |
+| classic | max_token_length | integer | 256 | No | `ClassicTokenizer `__ |
 +-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| edge_ngram | min_gram_size | integer | 1 | No | `EdgeNGramTokenizer `__ |
+| edge_ngram | min_gram_size | integer | 1 | No | `EdgeNGramTokenizer `__ |
 | +-------------------+--------------+----------------+------------+ |
 | | max_gram_size | integer | 2 | No | |
 +-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| keyword | | | | | `KeywordTokenizer `__ |
+| keyword | | | | | `KeywordTokenizer `__ |
 +-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| letter | | | | | `LetterTokenizer `__ |
+| letter | | | | | `LetterTokenizer `__ |
 +-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| lower_case | | | | | `LowerCaseTokenizer `__ |
+| lower_case | | | | | `LowerCaseTokenizer `__ |
 +-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| ngram | min_gram_size | integer | 1 | No | `NGramTokenizer `__ |
+| ngram | min_gram_size | integer | 1 | No | `NGramTokenizer `__ |
 | +-------------------+--------------+----------------+------------+ |
 | | max_gram_size | integer | 2 | No | |
 +-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| path_hierarchy | reverse | boolean | false | No | `PathHierarchyTokenizer `__ |
+| path_hierarchy | reverse | boolean | false | No | `PathHierarchyTokenizer `__ |
 | +-------------------+--------------+----------------+------------+ |
 | | delimiter | char | / | No | |
 | +-------------------+--------------+----------------+------------+ |
@@ -993,91 +993,91 @@ Tokenizer
 | +-------------------+--------------+----------------+------------+ |
 | | skip | integer | 0 | No | |
 +-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| pattern | pattern | string | null | No | `PatternTokenizer `__ |
+| pattern | pattern | string | null | No | `PatternTokenizer `__ |
 | +-------------------+--------------+----------------+------------+ |
 | | group | integer | -1 | No | |
 +-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| standard | max_token_length | integer | 256 | No | `StandardTokenizer `__ |
+| standard | max_token_length | integer | 256 | No | `StandardTokenizer `__ |
 +-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| uax29_url_email | max_token_length | integer | 256 | No | `UAX29URLEmailTokenizer `__ |
+| uax29_url_email | max_token_length | integer | 256 | No | `UAX29URLEmailTokenizer `__ |
 +-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| thai | | | | | `ThaiTokenizer `__ |
+| thai | | | | | `ThaiTokenizer `__ |
 +-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| whitespace | rule | string | java | No | `WhitespaceTokenizer `__ |
+| whitespace | rule | string | java | No | `WhitespaceTokenizer `__ |
 +-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| wikipedia | | | | | `WikipediaTokenizer `__ |
+| wikipedia | | | | | `WikipediaTokenizer `__ |
 +-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+

 TokenFilter
 ~~~~~~~~~~~
 +-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| Name | Option | Value type | Default value | Mandatory | TokenFilter class name |
+| Name | Option | Value type | Default value | Mandatory | TokenFilter class name |
 +=============================+===================+==============+================+============+====================================================================================================+
-| standard | | | | | `StandardFilter `__ |
+| standard | | | | | `StandardFilter `__ |
 +-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| apostrophe | | | | | `ApostropheFilter `__ |
+| apostrophe | | | | | `ApostropheFilter `__ |
 +-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| arabicnormalization | | | | | `ArabicNormalizationFilter `__ |
+| arabicnormalization | | | | | `ArabicNormalizationFilter `__ |
 +-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| arabicstem | | | | | `ArabicStemFilter `__ |
+| arabicstem | | | | | `ArabicStemFilter `__ |
 +-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| indicnormalization | | | | | `IndicNormalizationFilter `__ |
+| indicnormalization | | | | | `IndicNormalizationFilter `__ |
 +-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| portuguesestem | | | | | `PortugueseStemFilter `__ |
+| portuguesestem | | | | | `PortugueseStemFilter `__ |
 +-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| germanminimalstem | | | | | `GermanMinimalStemFilter `__ |
+| germanminimalstem | | | | | `GermanMinimalStemFilter `__ |
 +-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| uppercase | | | | | `UpperCaseFilter `__ |
+| uppercase | | | | | `UpperCaseFilter `__ |
 +-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| keywordrepeat | | | | | `KeywordRepeatFilter `__ |
+| keywordrepeat | | | | | `KeywordRepeatFilter `__ |
 +-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| classic | | | | | `ClassicFilter `__ |
+| classic | | | | | `ClassicFilter `__ |
 +-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| shingle | min_shingle_size | integer | 2 | No | `ShingleFilter `__ |
+| shingle | min_shingle_size | integer | 2 | No | `ShingleFilter `__ |
 | +-------------------+--------------+----------------+------------+ |
 | | max_shingle_size | integer | 2 | No | |
 +-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| stemmeroverride | dictionary | string | | No | `StemmerOverrideFilter `__ |
+| stemmeroverride | dictionary | string | | No | `StemmerOverrideFilter `__ |
 | +-------------------+--------------+----------------+------------+ |
 | | ignore_case | boolean | false | No | |
 +-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| bulgarianstem | | | | | `BulgarianStemFilter `__ |
+| bulgarianstem | | | | | `BulgarianStemFilter `__ |
 +-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| swedishlightstem | | | | | `SwedishLightStemFilter `__ |
+| swedishlightstem | | | | | `SwedishLightStemFilter `__ |
 +-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| frenchlightstem | | | | | `FrenchLightStemFilter `__ |
+| frenchlightstem | | | | | `FrenchLightStemFilter `__ |
 +-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| cjkwidth | | | | | `CJKWidthFilter `__ |
+| cjkwidth | | | | | `CJKWidthFilter `__ |
 +-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| greekstem | | | | | `GreekStemFilter `__ |
+| greekstem | | | | | `GreekStemFilter `__ |
 +-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| stop | words | string | | No | `StopFilter `__ |
+| stop | words | string | | No | `StopFilter `__ |
 | +-------------------+--------------+----------------+------------+ |
 | | format | string | | No | |
 | +-------------------+--------------+----------------+------------+ |
 | | ignore_case | boolean | false | No | |
 +-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| hindistem | | | | | `HindiStemFilter `__ |
+| hindistem | | | | | `HindiStemFilter `__ |
 +-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| fingerprint | maxOutputTokenSize| integer | 1024 | No | `FingerprintFilterFactory `__ |
+| fingerprint | maxOutputTokenSize| integer | 1024 | No | `FingerprintFilterFactory `__ |
 | +-------------------+--------------+----------------+------------+ |
 | | separator | char | " " | No | |
 +-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| spanishlightstem | | | | | `SpanishLightStemFilterFactory `__ |
+| spanishlightstem | | | | | `SpanishLightStemFilterFactory `__ |
 +-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| hungarianlightstem | | | | | `HungarianLightStemFilterFactory `__ |
+| hungarianlightstem | | | | | `HungarianLightStemFilterFactory `__ |
 +-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| norwegianminimalstem | | | | | `NorwegianMinimalStemFilterFactory `__ |
+| norwegianminimalstem | | | | | `NorwegianMinimalStemFilterFactory `__ |
 +-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| persiannormalization | | | | | `PersianNormalizationFilterFactory `__ |
+| persiannormalization | | | | | `PersianNormalizationFilterFactory `__ |
 +-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| germanlightstem | | | | | `GermanLightStemFilterFactory `__ |
+| germanlightstem | | | | | `GermanLightStemFilterFactory `__ |
 +-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| type | | | | | `TypeTokenFilterFactory `__ |
+| type | | | | | `TypeTokenFilterFactory `__ |
 +-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| asciifolding | preserveOriginal | boolean | false | No | `ASCIIFoldingFilterFactory `__ |
+| asciifolding | preserveOriginal | boolean | false | No | `ASCIIFoldingFilterFactory `__ |
 +-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| lowercase | | | | | `GreekLowerCaseFilterFactory `__ |
+| lowercase | | | | | `GreekLowerCaseFilterFactory `__ |
 +-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+

 Mappers

From 3822c65a53edbfb13d6813fba674666001088dcf Mon Sep 17 00:00:00 2001
From: jpgilaberte
Date: Mon, 12 Jun 2017 14:10:30 +0200
Subject: [PATCH 24/40] Fix mandatory column size

---
 doc/documentation.rst | 138 +++++++++++++++++++++---------------------
 1 file changed, 69 insertions(+), 69 deletions(-)

diff --git a/doc/documentation.rst b/doc/documentation.rst
index ee27f25b0..4b1d0bb5b 100644
--- a/doc/documentation.rst
+++ b/doc/documentation.rst
@@ -1010,75 +1010,75 @@ Tokenizer
 TokenFilter
 ~~~~~~~~~~~
-+-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| Name | Option | Value type | Default value | Mandatory | TokenFilter class name |
-+=============================+===================+==============+================+============+====================================================================================================+
-| standard | | | | | `StandardFilter `__ |
-+-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| apostrophe | | | | | `ApostropheFilter `__ |
-+-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| arabicnormalization | | | | | `ArabicNormalizationFilter `__ |
-+-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| arabicstem | | | | | `ArabicStemFilter `__ |
-+-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| indicnormalization | | | | | `IndicNormalizationFilter `__ |
-+-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| portuguesestem | | | | | `PortugueseStemFilter `__ |
-+-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| germanminimalstem | | | | | `GermanMinimalStemFilter `__ |
-+-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| uppercase | | | | | `UpperCaseFilter `__ |
-+-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| keywordrepeat | | | | | `KeywordRepeatFilter `__ |
-+-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| classic | | | | | `ClassicFilter `__ |
-+-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| shingle | min_shingle_size | integer | 2 | No | `ShingleFilter `__ |
-| +-------------------+--------------+----------------+------------+ |
-| | max_shingle_size | integer | 2 | No | |
-+-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| stemmeroverride | dictionary | string | | No | `StemmerOverrideFilter `__ |
-| +-------------------+--------------+----------------+------------+ |
-| | ignore_case | boolean | false | No | |
-+-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| bulgarianstem | | | | | `BulgarianStemFilter `__ |
-+-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| swedishlightstem | | | | | `SwedishLightStemFilter `__ |
-+-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| frenchlightstem | | | | | `FrenchLightStemFilter `__ |
-+-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| cjkwidth | | | | | `CJKWidthFilter `__ |
-+-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| greekstem | | | | | `GreekStemFilter `__ |
-+-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| stop | words | string | | No | `StopFilter `__ |
-| +-------------------+--------------+----------------+------------+ |
-| | format | string | | No | |
-| +-------------------+--------------+----------------+------------+ |
-| | ignore_case | boolean | false | No | |
-+-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| hindistem | | | | | `HindiStemFilter `__ |
-+-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| fingerprint | maxOutputTokenSize| integer | 1024 | No | `FingerprintFilterFactory `__ |
-| +-------------------+--------------+----------------+------------+ |
-| | separator | char | " " | No | |
-+-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| spanishlightstem | | | | | `SpanishLightStemFilterFactory `__ |
-+-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| hungarianlightstem | | | | | `HungarianLightStemFilterFactory `__ |
-+-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| norwegianminimalstem | | | | | `NorwegianMinimalStemFilterFactory `__ |
-+-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| persiannormalization | | | | | `PersianNormalizationFilterFactory `__ |
-+-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| germanlightstem | | | | | `GermanLightStemFilterFactory `__ |
-+-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| type | | | | | `TypeTokenFilterFactory `__ |
-+-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| asciifolding | preserveOriginal | boolean | false | No | `ASCIIFoldingFilterFactory `__ |
-+-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
-| lowercase | | | | | `GreekLowerCaseFilterFactory `__ |
-+-----------------------------+-------------------+--------------+----------------+------------+----------------------------------------------------------------------------------------------------+
++-----------------------------+-------------------+--------------+----------------+-------+----------------------------------------------------------------------------------------------------+
+| Name | Option | Value type | Default value | Mand. | TokenFilter class name |
++=============================+===================+==============+================+=======+====================================================================================================+
+| standard | | | | | `StandardFilter `__ |
++-----------------------------+-------------------+--------------+----------------+-------+----------------------------------------------------------------------------------------------------+
+| apostrophe | | | | | `ApostropheFilter `__ |
++-----------------------------+-------------------+--------------+----------------+-------+----------------------------------------------------------------------------------------------------+
+| arabicnormalization | | | | | `ArabicNormalizationFilter `__ |
++-----------------------------+-------------------+--------------+----------------+-------+----------------------------------------------------------------------------------------------------+
+| arabicstem | | | | | `ArabicStemFilter `__ |
++-----------------------------+-------------------+--------------+----------------+-------+----------------------------------------------------------------------------------------------------+
+| indicnormalization | | | | | `IndicNormalizationFilter `__ |
++-----------------------------+-------------------+--------------+----------------+-------+----------------------------------------------------------------------------------------------------+
+| portuguesestem | | | | | `PortugueseStemFilter `__ |
++-----------------------------+-------------------+--------------+----------------+-------+----------------------------------------------------------------------------------------------------+
+| germanminimalstem | | | | | `GermanMinimalStemFilter `__ |
++-----------------------------+-------------------+--------------+----------------+-------+----------------------------------------------------------------------------------------------------+
+| uppercase | | | | | `UpperCaseFilter `__ |
++-----------------------------+-------------------+--------------+----------------+-------+----------------------------------------------------------------------------------------------------+
+| keywordrepeat | | | | | `KeywordRepeatFilter `__ |
++-----------------------------+-------------------+--------------+----------------+-------+----------------------------------------------------------------------------------------------------+
+| classic | | | | | `ClassicFilter `__ |
++-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| shingle | min_shingle_size | integer | 2 | No | `ShingleFilter `__ | +| +-------------------+--------------+----------------+-------+ | +| | max_shingle_size | integer | 2 | No | | ++-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| stemmeroverride | dictionary | string | | No | `StemmerOverrideFilter `__ | +| +-------------------+--------------+----------------+-------+ | +| | ignore_case | boolean | false | No | | ++-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| bulgarianstem | | | | | `BulgarianStemFilter `__ | ++-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| swedishlightstem | | | | | `SwedishLightStemFilter `__ | ++-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| frenchlightstem | | | | | `FrenchLightStemFilter `__ | ++-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| cjkwidth | | | | | `CJKWidthFilter `__ | ++-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| greekstem | | | | | `GreekStemFilter `__ | ++-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| stop | words | string | | No | `StopFilter `__ | +| +-------------------+--------------+----------------+-------+ | +| | format | string | | No | | +| +-------------------+--------------+----------------+-------+ | +| | ignore_case | boolean | false | No | | ++-----------------------------+-------------------+--------------+----------------+-------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| hindistem | 
| | | | `HindiStemFilter `__ | ++-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| fingerprint | maxOutputTokenSize| integer | 1024 | No | `FingerprintFilterFactory `__ | +| +-------------------+--------------+----------------+-------+ | +| | separator | char | " " | No | | ++-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| spanishlightstem | | | | | `SpanishLightStemFilterFactory `__ | ++-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| hungarianlightstem | | | | | `HungarianLightStemFilterFactory `__ | ++-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| norwegianminimalstem | | | | | `NorwegianMinimalStemFilterFactory `__ | ++-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| persiannormalization | | | | | `PersianNormalizationFilterFactory `__ | ++-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| germanlightstem | | | | | `GermanLightStemFilterFactory `__ | ++-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| type | | | | | `TypeTokenFilterFactory `__ | ++-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| asciifolding | preserveOriginal | boolean | false | No | `ASCIIFoldingFilterFactory `__ | ++-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| lowercase | | | | | `GreekLowerCaseFilterFactory `__ | 
++-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ Mappers ======= From 5ed61a853e3fd1a59f09003cba1fdf5e3de70987 Mon Sep 17 00:00:00 2001 From: jpgilaberte Date: Mon, 12 Jun 2017 15:41:28 +0200 Subject: [PATCH 25/40] Fix mandatory column size --- doc/documentation.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/documentation.rst b/doc/documentation.rst index 4b1d0bb5b..5e6fff2db 100644 --- a/doc/documentation.rst +++ b/doc/documentation.rst @@ -1056,7 +1056,7 @@ TokenFilter | | format | string | | No | | | +-------------------+--------------+----------------+-------+ | | | ignore_case | boolean | false | No | | -+-----------------------------+-------------------+--------------+----------------+-------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ ++-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | hindistem | | | | | `HindiStemFilter `__ | +-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | fingerprint | maxOutputTokenSize| integer | 1024 | No | `FingerprintFilterFactory `__ | From cfa3a20fc85b4b64da7be6037f980897554f864f Mon Sep 17 00:00:00 2001 From: jpgilaberte Date: Tue, 13 Jun 2017 15:39:37 +0200 Subject: [PATCH 26/40] Add more TokenFilters --- .../tokenFilter/TokenFilterBuilder.scala | 205 +++++++++++------- 1 file changed, 132 insertions(+), 73 deletions(-) diff --git a/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenFilter/TokenFilterBuilder.scala b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenFilter/TokenFilterBuilder.scala index 7ee256d0b..d36633d84 100644 --- a/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenFilter/TokenFilterBuilder.scala +++ b/plugin/src/main/scala/com/stratio/cassandra/lucene/schema/analysis/tokenFilter/TokenFilterBuilder.scala @@ -32,7 +32,8 @@ import org.apache.lucene.analysis.util.TokenFilterFactory * @author Juan Pedro Gilaberte jpgilaberte@stratio.com */ @JsonTypeInfo(use = JsonTypeInfo.Id.NAME, include = JsonTypeInfo.As.PROPERTY, property = "type") -@JsonSubTypes(Array(new Type(value = classOf[StandardTokenFilterBuilder], name = "standard"), +@JsonSubTypes(Array( + new Type(value = classOf[StandardTokenFilterBuilder], name = "standard"), new Type(value = classOf[ApostropheTokenFilterBuilder], name = "apostrophe"), new Type(value = classOf[ArabicNormalizationTokenFilterBuilder], name = "arabicnormalization"), new Type(value = classOf[ArabicStemTokenFilterBuilder], name = "arabicstem"), @@ -59,8 +60,69 @@ import org.apache.lucene.analysis.util.TokenFilterFactory new Type(value = classOf[PersiannormalizationTokenFilterBuilder], name = "persiannormalization"), new Type(value = 
classOf[GermanlightstemTokenFilterBuilder], name = "germanlightstem"), new Type(value = classOf[TypeTokenFilterBuilder], name = "type"), - new Type(value = classOf[AsciifoldingTokenFilter], name = "asciifolding"), - new Type(value = classOf[LowercaseTokenFilter], name = "lowercase") + new Type(value = classOf[GermanstemTokenFilterBuilder], name = "germanstem"), + new Type(value = classOf[NGramTokenFilterBuilder], name = "ngram"), + new Type(value = classOf[LimittokenpositionTokenFilterBuilder], name = "limittokenposition"), + new Type(value = classOf[GreeklowercaseTokenFilterBuilder], name = "greeklowercase"), + new Type(value = classOf[LimittokenoffsetTokenFilterBuilder], name = "limittokenoffset"), + new Type(value = classOf[SnowballporterTokenFilterBuilder], name = "snowballporter"), + new Type(value = classOf[TypeaspayloadTokenFilterBuilder], name = "typeaspayload"), + new Type(value = classOf[PatternreplaceTokenFilterBuilder], name = "patternreplace"), + new Type(value = classOf[CjkbigramTokenFilterBuilder], name = "cjkbigram"), + new Type(value = classOf[KeywordmarkerTokenFilterBuilder], name = "keywordmarker"), + new Type(value = classOf[SoranistemTokenFilterBuilder], name = "soranistem"), + new Type(value = classOf[ElisionTokenFilterBuilder], name = "elision"), + new Type(value = classOf[HunspellstemTokenFilterBuilder], name = "hunspellstem"), + new Type(value = classOf[CodepointcountTokenFilterBuilder], name = "codepointcount"), + new Type(value = classOf[CzechstemTokenFilterBuilder], name = "czechstem"), + new Type(value = classOf[TurkishlowercaseTokenFilterBuilder], name = "turkishlowercase"), + new Type(value = classOf[DaterecognizerTokenFilterBuilder], name = "daterecognizer"), + new Type(value = classOf[PortugueselightstemTokenFilterBuilder], name = "portugueselightstem"), + new Type(value = classOf[IrishlowercaseTokenFilterBuilder], name = "irishlowercase"), + new Type(value = classOf[CommongramsqueryTokenFilterBuilder], name = "commongramsquery"), + new Type(value = classOf[NumericpayloadTokenFilterBuilder], name = "numericpayload"), + new Type(value = classOf[ScandinavianfoldingTokenFilterBuilder], name = "scandinavianfolding"), + new Type(value = classOf[GermannormalizationTokenFilterBuilder], name = "germannormalization"), + new Type(value = classOf[DelimitedpayloadTokenFilterBuilder], name = "delimitedpayload"), + new Type(value = classOf[WorddelimiterTokenFilterBuilder], name = "worddelimiter"), + new Type(value = classOf[PortugueseminimalstemTokenFilterBuilder], name = "portugueseminimalstem"), + new Type(value = classOf[RemoveduplicatesTokenFilterBuilder], name = "removeduplicates"), + new Type(value = classOf[EdgengramTokenFilterBuilder], name = "edgengram"), + new Type(value = classOf[LatvianstemTokenFilterBuilder], name = "latvianstem"), + new Type(value = classOf[PorterstemTokenFilterBuilder], name = "porterstem"), + new Type(value = classOf[FinnishlightstemTokenFilterBuilder], name = "finnishlightstem"), + new Type(value = classOf[CommongramsTokenFilterBuilder], name = "commongrams"), + new Type(value = classOf[GalicianstemTokenFilterBuilder], name = "galicianstem"), + new Type(value = classOf[KstemTokenFilterBuilder], name = "kstem"), + new Type(value = classOf[AsciifoldingTokenFilterBuilder], name = "asciifolding"), + new Type(value = classOf[NorwegianlightstemTokenFilterBuilder], name = "norwegianlightstem"), + new Type(value = classOf[TrimTokenFilterBuilder], name = "trim"), + new Type(value = classOf[LengthTokenFilterBuilder], name = "length"), + new 
Type(value = classOf[DecimaldigitTokenFilterBuilder], name = "decimaldigit"), + new Type(value = classOf[BrazilianstemTokenFilterBuilder], name = "brazilianstem"), + new Type(value = classOf[CapitalizationTokenFilterBuilder], name = "capitalization"), + new Type(value = classOf[SerbiannormalizationTokenFilterBuilder], name = "serbiannormalization"), + new Type(value = classOf[FrenchminimalstemTokenFilterBuilder], name = "frenchminimalstem"), + new Type(value = classOf[EnglishminimalstemTokenFilterBuilder], name = "englishminimalstem"), + new Type(value = classOf[LimittokencountTokenFilterBuilder], name = "limittokencount"), + new Type(value = classOf[HyphenatedwordsTokenFilterBuilder], name = "hyphenatedwords"), + new Type(value = classOf[TruncateTokenFilterBuilder], name = "truncate"), + new Type(value = classOf[TokenoffsetpayloadTokenFilterBuilder], name = "tokenoffsetpayload"), + new Type(value = classOf[GalicianminimalstemTokenFilterBuilder], name = "galicianminimalstem"), + new Type(value = classOf[RussianlightstemTokenFilterBuilder], name = "russianlightstem"), + new Type(value = classOf[EnglishpossessiveTokenFilterBuilder], name = "englishpossessive"), + new Type(value = classOf[LowercaseTokenFilterBuilder], name = "lowercase"), + new Type(value = classOf[HindinormalizationTokenFilterBuilder], name = "hindinormalization"), + new Type(value = classOf[ScandinaviannormalizationTokenFilterBuilder], name = "scandinaviannormalization"), + new Type(value = classOf[ThaiwordTokenFilterBuilder], name = "thaiword"), + new Type(value = classOf[SynonymTokenFilterBuilder], name = "synonym"), + new Type(value = classOf[IndonesianstemTokenFilterBuilder], name = "indonesianstem"), + new Type(value = classOf[KeepwordTokenFilterBuilder], name = "keepword"), + new Type(value = classOf[HyphenationcompoundwordTokenFilterBuilder], name = "hyphenationcompoundword"), + new Type(value = classOf[DictionarycompoundwordTokenFilterBuilder], name = "dictionarycompoundword"), + new Type(value = classOf[ItalianlightstemTokenFilterBuilder], name = "italianlightstem"), + new Type(value = classOf[PatterncapturegroupTokenFilterBuilder], name = "patterncapturegroup"), + new Type(value = classOf[ReverseStringTokenFilterBuilder], name = "reversestring") )) sealed abstract class TokenFilterBuilder[T](typeBuilder: String) extends Builder[T]{ /** {@inheritDoc} */ @@ -78,7 +140,7 @@ final case class GermanMinimalStemTokenFilterBuilder() extends TokenFilterBuilde final case class UpperCaseTokenFilterBuilder() extends TokenFilterBuilder[TokenFilterFactory]("uppercase") final case class KeywordRepeatTokenFilterBuilder() extends TokenFilterBuilder[TokenFilterFactory]("keywordrepeat") final case class ClassicTokenFilterBuilder() extends TokenFilterBuilder[TokenFilterFactory]("classic") -final case class ShingleTokenFilterBuilder(@JsonProperty("min_shingle_size") minShingleSize: Integer, @JsonProperty("max_shingle_size") maxShingleSize: Integer) extends TokenFilterBuilder[TokenFilterFactory]("shingle") +final case class ShingleTokenFilterBuilder(@JsonProperty("min_shingle_size") minShingleSize: Integer, @JsonProperty("max_shingle_size") maxShingleSize: Integer, @JsonProperty("outputUnigrams") outputUnigrams: Boolean, @JsonProperty("OUIfNoShingles") outputUnigramsIfNoShingles: Boolean, @JsonProperty("tokenSeparator") tokenSeparator: String, @JsonProperty("fillerToken") fillerToken: String) extends TokenFilterBuilder[TokenFilterFactory]("shingle") final case class StemmeroverrideTokenFilterBuilder(@JsonProperty("dictionary") 
dictionary: String, @JsonProperty("ignore_case") ignoreCase: Boolean) extends TokenFilterBuilder[TokenFilterFactory]("stemmeroverride") final case class BulgarianstemTokenFilterBuilder() extends TokenFilterBuilder[TokenFilterFactory]("bulgarianstem") final case class SwedishlightstemTokenFilterBuilder() extends TokenFilterBuilder[TokenFilterFactory]("swedishlightstem") @@ -93,72 +155,69 @@ final case class HungarianlightstemTokenFilterBuilder() extends TokenFilterBuild final case class NorwegianminimalstemTokenFilterBuilder() extends TokenFilterBuilder[TokenFilterFactory]("norwegianminimalstem") final case class PersiannormalizationTokenFilterBuilder() extends TokenFilterBuilder[TokenFilterFactory]("persiannormalization") final case class GermanlightstemTokenFilterBuilder() extends TokenFilterBuilder[TokenFilterFactory]("germanlightstem") -final case class TypeTokenFilterBuilder() extends TokenFilterBuilder[TokenFilterFactory]("type") -// -final case class AsciifoldingTokenFilter(@JsonProperty("preserve_original") preserveOriginal:Boolean) extends TokenFilterBuilder[TokenFilterFactory]("asciifolding") -final case class LowercaseTokenFilter() extends TokenFilterBuilder[TokenFilterFactory]("lowercase") -// -//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("germanstem") -//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("ngram") -//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("limittokenposition") -//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("greeklowercase") -//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("standard") -//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("limittokenoffset") -//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("snowballporter") -//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("typeaspayload") -//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("patternreplace") -//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("cjkbigram") -//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("keywordmarker") -//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("soranistem") -//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("elision") -//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("hunspellstem") -//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("codepointcount") -//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("czechstem") -//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("turkishlowercase") -//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("daterecognizer") -//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("portugueselightstem") -//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("irishlowercase") -//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("commongramsquery") -//final case class 
StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("numericpayload") -//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("scandinavianfolding") -//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("germannormalization") -//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("delimitedpayload") -//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("worddelimiter") -//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("portugueseminimalstem") -//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("removeduplicates") -//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("edgengram") -//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("latvianstem") -//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("porterstem") -//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("finnishlightstem") -//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("commongrams") -//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("galicianstem") -//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("kstem") -//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("reversestring") -//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("asciifolding") -//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("norwegianlightstem") -//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("trim") -//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("length") -//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("decimaldigit") -//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("brazilianstem") -//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("capitalization") -//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("serbiannormalization") -//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("frenchminimalstem") -//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("englishminimalstem") -//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("limittokencount") -//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("hyphenatedwords") -//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("truncate") -//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("tokenoffsetpayload") -//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("galicianminimalstem") -//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("russianlightstem") -//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("englishpossessive") -//final case class 
StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("lowercase") -//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("hindinormalization") -//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("scandinaviannormalization") -//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("thaiword") -//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("synonym") -//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("indonesianstem") -//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("keepword") -//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("hyphenationcompoundword") -//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("dictionarycompoundword") -//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("italianlightstem") -//final case class StandardTokenFilter() extends TokenizerFilterBuilder[TokenFilterFactory]("patterncapturegroup") +final case class TypeTokenFilterBuilder(@JsonProperty("types") types: String, @JsonProperty("useWhitelist") useWhitelist: Boolean) extends TokenFilterBuilder[TokenFilterFactory]("type") +final case class GermanstemTokenFilterBuilder() extends TokenFilterBuilder[TokenFilterFactory]("germanstem") +final case class NGramTokenFilterBuilder(@JsonProperty("minGramSize") minGramSize: Integer, @JsonProperty("maxGramSize") maxGramSize: Integer) extends TokenFilterBuilder[TokenFilterFactory]("ngram") +final case class LimittokenpositionTokenFilterBuilder(@JsonProperty("maxTokenPosition") maxTokenPosition: Integer, @JsonProperty("consumeAllTokens") consumeAllTokens: Boolean) extends TokenFilterBuilder[TokenFilterFactory]("limittokenposition") +final case class GreeklowercaseTokenFilterBuilder() extends TokenFilterBuilder[TokenFilterFactory]("greeklowercase") +final case class LimittokenoffsetTokenFilterBuilder(@JsonProperty("maxStartOffset") maxStartOffset: Integer, @JsonProperty("consumeAllTokens") consumeAllTokens: Boolean) extends TokenFilterBuilder[TokenFilterFactory]("limittokenoffset") +final case class SnowballporterTokenFilterBuilder(@JsonProperty("protected") `protected` : String, @JsonProperty("language") language: String) extends TokenFilterBuilder[TokenFilterFactory]("snowballporter") +final case class TypeaspayloadTokenFilterBuilder() extends TokenFilterBuilder[TokenFilterFactory]("typeaspayload") +final case class PatternreplaceTokenFilterBuilder(@JsonProperty("pattern") pattern : String, @JsonProperty("replacement") replacement: String) extends TokenFilterBuilder[TokenFilterFactory]("patternreplace") +final case class CjkbigramTokenFilterBuilder(@JsonProperty("han") han: Boolean, @JsonProperty("hiragana") hiragana: Boolean, @JsonProperty("katakana") katakana: Boolean, @JsonProperty("hangul") hangul: Boolean, @JsonProperty("outputUnigrams") outputUnigrams: Boolean) extends TokenFilterBuilder[TokenFilterFactory]("cjkbigram") +final case class KeywordmarkerTokenFilterBuilder(@JsonProperty("protected") `protected` : String, @JsonProperty("pattern") pattern: String, @JsonProperty("ignoreCase") ignoreCase: Boolean) extends TokenFilterBuilder[TokenFilterFactory]("keywordmarker") +final case class SoranistemTokenFilterBuilder() extends TokenFilterBuilder[TokenFilterFactory]("soranistem") +final case 
class ElisionTokenFilterBuilder(@JsonProperty("articles") articles: String, @JsonProperty("ignoreCase") ignoreCase: Boolean) extends TokenFilterBuilder[TokenFilterFactory]("elision") +final case class HunspellstemTokenFilterBuilder(@JsonProperty("dictionary") dictionary: String, @JsonProperty("affix") affix: String, @JsonProperty("longestOnly") longestOnly: Boolean) extends TokenFilterBuilder[TokenFilterFactory]("hunspellstem") +final case class CodepointcountTokenFilterBuilder(@JsonProperty("min") min: Integer, @JsonProperty("max") max: Integer) extends TokenFilterBuilder[TokenFilterFactory]("codepointcount") +final case class CzechstemTokenFilterBuilder() extends TokenFilterBuilder[TokenFilterFactory]("czechstem") +final case class TurkishlowercaseTokenFilterBuilder() extends TokenFilterBuilder[TokenFilterFactory]("turkishlowercase") +final case class DaterecognizerTokenFilterBuilder(@JsonProperty("datePattern") datePattern: String, @JsonProperty("locale") locale: String) extends TokenFilterBuilder[TokenFilterFactory]("daterecognizer") +final case class PortugueselightstemTokenFilterBuilder() extends TokenFilterBuilder[TokenFilterFactory]("portugueselightstem") +final case class IrishlowercaseTokenFilterBuilder() extends TokenFilterBuilder[TokenFilterFactory]("irishlowercase") +final case class CommongramsqueryTokenFilterBuilder( @JsonProperty("words") words: String, @JsonProperty("ignoreCase") ignoreCase: Boolean) extends TokenFilterBuilder[TokenFilterFactory]("commongramsquery") +final case class NumericpayloadTokenFilterBuilder(@JsonProperty("payload") payload: Integer, @JsonProperty("typeMatch") typeMatch: String) extends TokenFilterBuilder[TokenFilterFactory]("numericpayload") +final case class ScandinavianfoldingTokenFilterBuilder() extends TokenFilterBuilder[TokenFilterFactory]("scandinavianfolding") +final case class GermannormalizationTokenFilterBuilder() extends TokenFilterBuilder[TokenFilterFactory]("germannormalization") +final case class DelimitedpayloadTokenFilterBuilder(@JsonProperty("encoder") encoder: String, @JsonProperty("delimiter") delimiter: String) extends TokenFilterBuilder[TokenFilterFactory]("delimitedpayload") +final case class WorddelimiterTokenFilterBuilder(@JsonProperty("protected") `protected` : String, @JsonProperty("preserveOriginal") preserveOriginal: Integer, @JsonProperty("splitOnNumerics") splitOnNumerics : Integer, @JsonProperty("splitOnCaseChange") splitOnCaseChange: Integer, @JsonProperty("catenateWords") catenateWords : Integer, @JsonProperty("catenateNumbers") catenateNumbers: Integer, @JsonProperty("catenateAll") catenateAll : Integer, @JsonProperty("generateWordParts") generateWordParts: Integer, @JsonProperty("stemEnglishPosse") stemEnglishPossessive : Integer, @JsonProperty("genNumberParts") generateNumberParts: Integer, @JsonProperty("types") types: String) extends TokenFilterBuilder[TokenFilterFactory]("worddelimiter") +final case class PortugueseminimalstemTokenFilterBuilder() extends TokenFilterBuilder[TokenFilterFactory]("portugueseminimalstem") +final case class RemoveduplicatesTokenFilterBuilder() extends TokenFilterBuilder[TokenFilterFactory]("removeduplicates") +final case class EdgengramTokenFilterBuilder(@JsonProperty("minGramSize") minGramSize: Integer, @JsonProperty("maxGramSize") maxGramSize: Integer) extends TokenFilterBuilder[TokenFilterFactory]("edgengram") +final case class LatvianstemTokenFilterBuilder() extends TokenFilterBuilder[TokenFilterFactory]("latvianstem") +final case class PorterstemTokenFilterBuilder() extends 
TokenFilterBuilder[TokenFilterFactory]("porterstem") +final case class FinnishlightstemTokenFilterBuilder() extends TokenFilterBuilder[TokenFilterFactory]("finnishlightstem") +final case class CommongramsTokenFilterBuilder(@JsonProperty("words") words: String, @JsonProperty("ignoreCase") ignoreCase: Boolean) extends TokenFilterBuilder[TokenFilterFactory]("commongrams") +final case class GalicianstemTokenFilterBuilder() extends TokenFilterBuilder[TokenFilterFactory]("galicianstem") +final case class KstemTokenFilterBuilder() extends TokenFilterBuilder[TokenFilterFactory]("kstem") +final case class AsciifoldingTokenFilterBuilder(@JsonProperty("preserve_original") preserveOriginal:Boolean) extends TokenFilterBuilder[TokenFilterFactory]("asciifolding") +final case class NorwegianlightstemTokenFilterBuilder(@JsonProperty("variant") variant: String) extends TokenFilterBuilder[TokenFilterFactory]("norwegianlightstem") +final case class TrimTokenFilterBuilder() extends TokenFilterBuilder[TokenFilterFactory]("trim") +final case class LengthTokenFilterBuilder(@JsonProperty("min") min: Integer, @JsonProperty("max") max: Integer) extends TokenFilterBuilder[TokenFilterFactory]("length") +final case class DecimaldigitTokenFilterBuilder() extends TokenFilterBuilder[TokenFilterFactory]("decimaldigit") +final case class BrazilianstemTokenFilterBuilder() extends TokenFilterBuilder[TokenFilterFactory]("brazilianstem") +final case class CapitalizationTokenFilterBuilder(@JsonProperty("onlyFirstWord") onlyFirstWord:Boolean, @JsonProperty("keep") keep:String, @JsonProperty("keepIgnoreCase") keepIgnoreCase:Boolean, @JsonProperty("okPrefix") okPrefix:Boolean) extends TokenFilterBuilder[TokenFilterFactory]("capitalization") +final case class SerbiannormalizationTokenFilterBuilder() extends TokenFilterBuilder[TokenFilterFactory]("serbiannormalization") +final case class FrenchminimalstemTokenFilterBuilder() extends TokenFilterBuilder[TokenFilterFactory]("frenchminimalstem") +final case class EnglishminimalstemTokenFilterBuilder() extends TokenFilterBuilder[TokenFilterFactory]("englishminimalstem") +final case class LimittokencountTokenFilterBuilder(@JsonProperty("maxTokenCount") maxTokenCount: Integer, @JsonProperty("consumeAllTokens") consumeAllTokens:Boolean) extends TokenFilterBuilder[TokenFilterFactory]("limittokencount") +final case class HyphenatedwordsTokenFilterBuilder() extends TokenFilterBuilder[TokenFilterFactory]("hyphenatedwords") +final case class TruncateTokenFilterBuilder(@JsonProperty("prefixLength") prefixLength: Integer) extends TokenFilterBuilder[TokenFilterFactory]("truncate") +final case class TokenoffsetpayloadTokenFilterBuilder() extends TokenFilterBuilder[TokenFilterFactory]("tokenoffsetpayload") +final case class GalicianminimalstemTokenFilterBuilder() extends TokenFilterBuilder[TokenFilterFactory]("galicianminimalstem") +final case class RussianlightstemTokenFilterBuilder() extends TokenFilterBuilder[TokenFilterFactory]("russianlightstem") +final case class EnglishpossessiveTokenFilterBuilder() extends TokenFilterBuilder[TokenFilterFactory]("englishpossessive") +final case class LowercaseTokenFilterBuilder() extends TokenFilterBuilder[TokenFilterFactory]("lowercase") +final case class HindinormalizationTokenFilterBuilder() extends TokenFilterBuilder[TokenFilterFactory]("hindinormalization") +final case class ScandinaviannormalizationTokenFilterBuilder() extends TokenFilterBuilder[TokenFilterFactory]("scandinaviannormalization") +final case class ThaiwordTokenFilterBuilder() extends 
TokenFilterBuilder[TokenFilterFactory]("thaiword") +final case class SynonymTokenFilterBuilder(@JsonProperty("synonyms") synonyms: String, @JsonProperty("format") format: String, @JsonProperty("ignoreCase") ignoreCase: Boolean, @JsonProperty("expand") expand: Boolean, @JsonProperty("tokenizerFactory") tokenizerFactory: String) extends TokenFilterBuilder[TokenFilterFactory]("synonym") +final case class IndonesianstemTokenFilterBuilder(@JsonProperty("stemDerivational") stemDerivational: Boolean) extends TokenFilterBuilder[TokenFilterFactory]("indonesianstem") +final case class KeepwordTokenFilterBuilder(@JsonProperty("words") words: String, @JsonProperty("ignoreCase") ignoreCase: Boolean) extends TokenFilterBuilder[TokenFilterFactory]("keepword") +final case class HyphenationcompoundwordTokenFilterBuilder(@JsonProperty("hyphenator") hyphenator: String, @JsonProperty("encoding") encoding: String, @JsonProperty("dictionary") dictionary: String, @JsonProperty("minWordSize") minWordSize: Integer, @JsonProperty("minSubwordSize") minSubwordSize: Integer, @JsonProperty("maxSubwordSize") maxSubwordSize: Integer, @JsonProperty("onlyLongestMatch") onlyLongestMatch: Boolean) extends TokenFilterBuilder[TokenFilterFactory]("hyphenationcompoundword") +final case class DictionarycompoundwordTokenFilterBuilder(@JsonProperty("dictionary") dictionary: String, @JsonProperty("minWordSize") minWordSize: Integer, @JsonProperty("minSubwordSize") minSubwordSize: Integer, @JsonProperty("maxSubwordSize") maxSubwordSize: Integer, @JsonProperty("onlyLongestMatch") onlyLongestMatch: Boolean) extends TokenFilterBuilder[TokenFilterFactory]("dictionarycompoundword") +final case class ItalianlightstemTokenFilterBuilder() extends TokenFilterBuilder[TokenFilterFactory]("italianlightstem") +final case class PatterncapturegroupTokenFilterBuilder(@JsonProperty("pattern") pattern: String, @JsonProperty("preserve_original") preserve_original: Boolean) extends TokenFilterBuilder[TokenFilterFactory]("patterncapturegroup") +final case class ReverseStringTokenFilterBuilder() extends TokenFilterBuilder[TokenFilterFactory]("reversestring") + + From b45479e079c7c1be99e752d313a9b5d4759c4aa4 Mon Sep 17 00:00:00 2001 From: jpgilaberte Date: Tue, 13 Jun 2017 15:40:31 +0200 Subject: [PATCH 27/40] Add new TokenFilter documentation --- doc/documentation.rst | 270 +++++++++++++++++++++++++++++++++++++++--- 1 file changed, 251 insertions(+), 19 deletions(-) diff --git a/doc/documentation.rst b/doc/documentation.rst index 5e6fff2db..f1118a55a 100644 --- a/doc/documentation.rst +++ b/doc/documentation.rst @@ -955,13 +955,13 @@ CharFilter +-----------------------------+--------------+--------------+----------------+------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | Name | Option | Value type | Default value | Mandatory | CharFilter class name | +=============================+==============+==============+================+============+==========================================================================================================================================================================================+ -| pattern | pattern | string | null | No | `PatternReplaceCharFilter `__ | +| pattern | pattern | string | | No | `PatternReplaceCharFilterFactory `__ | 
+-----------------------------+--------------+--------------+----------------+------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| htmlstrip | escapedtags | string[] | null | No | `HTMLStripCharFilter `__ | +| htmlstrip | escapedtags | string[] | | No | `HTMLStripCharFilterFactory `__ | +-----------------------------+--------------+--------------+----------------+------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| mapping | mapping | string | null | No | `MappingCharFilter `__ | +| mapping | mapping | string | | No | `MappingCharFilterFactory `__ | +-----------------------------+--------------+--------------+----------------+------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| persian | | | | No | `PersianCharFilter `__ | +| persian | | | | No | `PersianCharFilterFactory `__ | +-----------------------------+--------------+--------------+----------------+------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ Tokenizer @@ -969,23 +969,23 @@ Tokenizer +-----------------------------+-------------------+--------------+----------------+------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | Name | Option | Value type | Default value | Mandatory | Tokenizer class name | +=============================+===================+==============+================+============+==========================================================================================================================================================================================+ -| classic | max_token_length | integer | 256 | No | `ClassicTokenizer `__ | +| classic | max_token_length | integer | 256 | No | `ClassicTokenizerFactory `__ | +-----------------------------+-------------------+--------------+----------------+------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| edge_ngram | min_gram_size | integer | 1 | No | `EdgeNGramTokenizer `__ | +| edge_ngram | min_gram_size | integer | 1 | No | `EdgeNGramTokenizerFactory `__ | | +-------------------+--------------+----------------+------------+ | | | max_gram_size | integer | 2 | No | | +-----------------------------+-------------------+--------------+----------------+------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| keyword | | | | | `KeywordTokenizer `__ | +| keyword | | | | | `KeywordTokenizerFactory `__ | 
+-----------------------------+-------------------+--------------+----------------+------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| letter | | | | | `LetterTokenizer `__ | +| letter | | | | | `LetterTokenizerFactory `__ | +-----------------------------+-------------------+--------------+----------------+------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| lower_case | | | | | `LowerCaseTokenizer `__ | +| lower_case | | | | | `LowerCaseTokenizerFactory `__ | +-----------------------------+-------------------+--------------+----------------+------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| ngram | min_gram_size | integer | 1 | No | `NGramTokenizer `__ | +| ngram | min_gram_size | integer | 1 | No | `NGramTokenizerFactory `__ | | +-------------------+--------------+----------------+------------+ | | | max_gram_size | integer | 2 | No | | +-----------------------------+-------------------+--------------+----------------+------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| path_hierarchy | reverse | boolean | false | No | `PathHierarchyTokenizer `__ | +| path_hierarchy | reverse | boolean | false | No | `PathHierarchyTokenizerFactory `__ | | +-------------------+--------------+----------------+------------+ | | | delimiter | char | / | No | | | +-------------------+--------------+----------------+------------+ | @@ -993,19 +993,19 @@ Tokenizer | +-------------------+--------------+----------------+------------+ | | | skip | integer | 0 | No | | +-----------------------------+-------------------+--------------+----------------+------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| pattern | pattern | string | null | No | `PatternTokenizer `__ | +| pattern | pattern | string | null | No | `PatternTokenizerFactory `__ | | +-------------------+--------------+----------------+------------+ | | | group | integer | -1 | No | | +-----------------------------+-------------------+--------------+----------------+------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| standard | max_token_length | integer | 256 | No | `StandardTokenizer `__ | +| standard | max_token_length | integer | 256 | No | `StandardTokenizerFactory `__ | +-----------------------------+-------------------+--------------+----------------+------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| uax29_url_email | max_token_length | integer | 256 | No | `UAX29URLEmailTokenizer `__ | +| uax29_url_email | max_token_length | integer | 256 | No | `UAX29URLEmailTokenizerFactory `__ | 
+-----------------------------+-------------------+--------------+----------------+------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| thai | | | | | `ThaiTokenizer `__ | +| thai | | | | | `ThaiTokenizerFactory `__ | +-----------------------------+-------------------+--------------+----------------+------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| whitespace | rule | string | java | No | `WhitespaceTokenizer `__ | +| whitespace | rule | string | java | No | `WhitespaceTokenizerFactory `__ | +-----------------------------+-------------------+--------------+----------------+------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| wikipedia | | | | | `WikipediaTokenizer `__ | +| wikipedia | | | | | `WikipediaTokenizerFactory `__ | +-----------------------------+-------------------+--------------+----------------+------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ TokenFilter @@ -1036,6 +1036,14 @@ TokenFilter | shingle | min_shingle_size | integer | 2 | No | `ShingleFilter `__ | | +-------------------+--------------+----------------+-------+ | | | max_shingle_size | integer | 2 | No | | +| +-------------------+--------------+----------------+-------+ | +| | outputUnigrams | boolean | false | No | | +| +-------------------+--------------+----------------+-------+ | +| | OUIfNoShingles | boolean | false | No | | +| +-------------------+--------------+----------------+-------+ | +| | tokenSeparator | string | | No | | +| +-------------------+--------------+----------------+-------+ | +| | fillerToken | string | | No | | +-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | stemmeroverride | dictionary | string | | No | `StemmerOverrideFilter `__ | | +-------------------+--------------+----------------+-------+ | @@ -1073,11 +1081,235 @@ TokenFilter +-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | germanlightstem | | | | | `GermanLightStemFilterFactory `__ | +-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| type | | | | | `TypeTokenFilterFactory `__ | +| type | types | string | | | `TypeTokenFilterFactory `__ | +| +-------------------+--------------+----------------+-------+ | +| | useWhitelist | boolean | false | No | | 
+-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | asciifolding | preserveOriginal | boolean | false | No | `ASCIIFoldingFilterFactory `__ | +-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| lowercase | | | | | `GreekLowerCaseFilterFactory `__ | +| lowercase | | | | | `LowerCaseFilterFactory `__ | ++-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| germanstem | | | | | `GermanStemFilterFactory `__ | ++-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| ngram | minGramSize | integer | 1 | No | `NGramFilterFactory `__ | +| +-------------------+--------------+----------------+-------+ | +| | maxGramSize | integer | 2 | No | | ++-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| limittokenposition | maxTokenPosition | integer | | Yes | `LimitTokenPositionFilterFactory `__ | +| +-------------------+--------------+----------------+-------+ | +| | consumeAllTokens | boolean | false | No | | ++-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| greeklowercase | | | | | `GreekLowerCaseFilterFactory `__ | ++-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| limittokenoffset | maxStartOffset | integer | | Yes | `LimitTokenOffsetFilterFactory `__ | +| +-------------------+--------------+----------------+-------+ | +| | consumeAllTokens | boolean | false | No | | ++-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| snowballporter | protected | string | | No | `SnowballPorterFilterFactory `__ | +| +-------------------+--------------+----------------+-------+ | +| | language | string | | No | | 
++-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| typeaspayload | | | | | `TypeAsPayloadTokenFilterFactory `__ | ++-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| patternreplace | pattern | string | | No | `PatternReplaceFilterFactory `__ | +| +-------------------+--------------+----------------+-------+ | +| | replacement | string | | No | | ++-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| cjkbigram | han | boolean | | | `CJKBigramFilterFactory `__ | +| +-------------------+--------------+----------------+-------+ | +| | hiragana | boolean | | No | | +| +-------------------+--------------+----------------+-------+ | +| | katakana | boolean | | No | | +| +-------------------+--------------+----------------+-------+ | +| | hangul | boolean | | No | | +| +-------------------+--------------+----------------+-------+ | +| | outputUnigrams | boolean | | No | | ++-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| keywordmarker | protected | string | | | `KeywordMarkerFilterFactory `__ | +| +-------------------+--------------+----------------+-------+ | +| | pattern | string | | No | | +| +-------------------+--------------+----------------+-------+ | +| | ignoreCase | boolean | false | No | | ++-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| soranistem | | | | | `SoraniStemFilterFactory `__ | ++-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| elision | articles | string | | No | `ElisionFilterFactory `__ | +| +-------------------+--------------+----------------+-------+ | +| | ignoreCase | boolean | false | No | | ++-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| hunspellstem | dictionary | string | | No | `HunspellStemFilterFactory `__ | +| +-------------------+--------------+----------------+-------+ | +| | affix | string | | No | | +| +-------------------+--------------+----------------+-------+ | +| 
| longestOnly | boolean | false | No | | ++-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| codepointcount | min | integer | 0 | No | `CodepointCountFilterFactory `__ | +| +-------------------+--------------+----------------+-------+ | +| | max | integer | 1 | No | | ++-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| czechstem | | | | | `CzechStemFilterFactory `__ | ++-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| turkishlowercase | | | | | `TurkishLowerCaseFilterFactory `__ | ++-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| daterecognizer | datePattern | string | | No | `DateRecognizerFilterFactory `__ | +| +-------------------+--------------+----------------+-------+ | +| | locale | string | | No | | ++-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| portugueselightstem | | | | | `PortugueseLightStemFilterFactory `__ | ++-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| irishlowercase | | | | | `IrishLowerCaseFilterFactory `__ | ++-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| commongramsquery | words | string | | No | `CommonGramsQueryFilterFactory `__ | +| +-------------------+--------------+----------------+-------+ | +| | ignoreCase | boolean | false | No | | ++-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| numericpayload | payload | integer | | No | `NumericPayloadTokenFilterFactory `__ | +| +-------------------+--------------+----------------+-------+ | +| | typeMatch | string | | No | | 
++-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| scandinavianfolding | | | | | `ScandinavianFoldingFilterFactory `__ | ++-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| germannormalization | | | | | `GermanNormalizationFilterFactory `__ | ++-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| delimitedpayload | encoder | string | | No | `DelimitedPayloadTokenFilterFactory `__ | +| +-------------------+--------------+----------------+-------+ | +| | delimiter | string | | No | | ++-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| worddelimiter | protected | string | | No | `WordDelimiterFilterFactory `__ | +| +-------------------+--------------+----------------+-------+ | +| | preserveOriginal | integer | 0 | No | | +| +-------------------+--------------+----------------+-------+ | +| | splitOnNumerics | integer | 1 | No | | +| +-------------------+--------------+----------------+-------+ | +| | splitOnCaseChange | integer | 1 | No | | +| +-------------------+--------------+----------------+-------+ | +| | catenateWords | integer | 0 | No | | +| +-------------------+--------------+----------------+-------+ | +| | catenateNumbers | integer | 0 | No | | +| +-------------------+--------------+----------------+-------+ | +| | catenateAll | integer | 0 | No | | +| +-------------------+--------------+----------------+-------+ | +| | generateWordParts | integer | 1 | No | | +| +-------------------+--------------+----------------+-------+ | +| | genNumberParts | integer | 1 | No | | +| +-------------------+--------------+----------------+-------+ | +| | stemEnglishPosse | integer | 1 | No | | +| +-------------------+--------------+----------------+-------+ | +| | types | string | | No | | ++-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| portugueseminimalstem | | | | | `PortugueseMinimalStemFilterFactory `__ | ++-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| removeduplicates | | | | | `RemoveDuplicatesTokenFilterFactory `__ |
++-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| edgengram | minGramSize | integer | 1 | No | `EdgeNGramFilterFactory `__ | +| +-------------------+--------------+----------------+-------+ | +| | maxGramSize | integer | 2 | No | | ++-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| latvianstem | | | | | `LatvianStemFilterFactory `__ | ++-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| finnishlightstem | | | | | `FinnishLightStemFilterFactory `__ | ++-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| commongrams | words | string | | No | `CommonGramsFilterFactory `__ | +| +-------------------+--------------+----------------+-------+ | +| | ignoreCase | boolean | false | No | | ++-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| galicianstem | | | | | `GalicianStemFilterFactory `__ | ++-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| kstem | | | | | `KStemFilterFactory `__ | ++-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| norwegianlightstem | variant | string | | No | `NorwegianLightStemFilterFactory `__ | ++-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| trim | | | | | `TrimFilterFactory `__ | ++-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| length | min | integer | 0 | No | `LengthFilterFactory `__ | +| +-------------------+--------------+----------------+-------+ | +| | max | integer | 1 | No | | 
++-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| decimaldigit | | | | | `DecimalDigitFilterFactory `__ | ++-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| brazilianstem | | | | | `BrazilianStemFilterFactory `__ | ++-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| capitalization | onlyFirstWord | boolean | false | No | `CapitalizationFilterFactory `__ | +| +-------------------+--------------+----------------+-------+ | +| | keep | string | | No | | +| +-------------------+--------------+----------------+-------+ | +| | keepIgnoreCase | boolean | false | No | | +| +-------------------+--------------+----------------+-------+ | +| | okPrefix | string | | No | | ++-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| serbiannormalization | | | | | `SerbianNormalizationFilterFactory `__ | ++-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| frenchminimalstem | | | | | `FrenchMinimalStemFilterFactory `__ | ++-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| englishminimalstem | | | | | `EnglishMinimalStemFilterFactory `__ | ++-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| limittokencount | maxTokenCount | integer | | No | `LimitTokenCountFilterFactory `__ | +| +-------------------+--------------+----------------+-------+ | +| | consumeAllTokens | boolean | false | No | | ++-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| hyphenatedwords | | | | | `HyphenatedWordsFilterFactory `__ | 
++-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| truncate | prefixLength | integer | | No | `TruncateTokenFilterFactory `__ | ++-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| tokenoffsetpayload | | | | | `TokenOffsetPayloadTokenFilterFactory `__ | ++-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| galicianminimalstem | | | | | `GalicianMinimalStemFilterFactory `__ | ++-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| russianlightstem | | | | | `RussianLightStemFilterFactory `__ | ++-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| hindinormalization | | | | | `HindiNormalizationFilterFactory `__ | ++-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| scandinaviannormalization | | | | | `ScandinavianNormalizationFilterFactory `__ | ++-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| thaiword | | | | | `ThaiWordFilterFactory `__ | ++-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| synonym | synonyms | string | | No | `SynonymFilterFactory `__ | +| +-------------------+--------------+----------------+-------+ | +| | format | string | | No | | +| +-------------------+--------------+----------------+-------+ | +| | ignoreCase | boolean | false | No | | +| +-------------------+--------------+----------------+-------+ | +| | expand | boolean | false | No | | +| +-------------------+--------------+----------------+-------+ | +| | tokenizerFactory | string | | No | |
++-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| indonesianstem | stemDerivational | boolean | false | No | `IndonesianStemFilterFactory `__ | ++-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| keepword | words | string | | No | `KeepWordFilterFactory `__ | +| +-------------------+--------------+----------------+-------+ | +| | ignoreCase | boolean | false | No | | ++-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| hyphenationcompoundword | hyphenator | string | | No | `HyphenationCompoundWordTokenFilterFactory `__ | +| +-------------------+--------------+----------------+-------+ | +| | encoding | string | | No | | +| +-------------------+--------------+----------------+-------+ | +| | dictionary | string | | No | | +| +-------------------+--------------+----------------+-------+ | +| | minWordSize | integer | | No | | +| +-------------------+--------------+----------------+-------+ | +| | minSubwordSize | integer | | No | | +| +-------------------+--------------+----------------+-------+ | +| | maxSubwordSize | integer | | No | | +| +-------------------+--------------+----------------+-------+ | +| | onlyLongestMatch | boolean | false | No | | ++-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| dictionarycompoundword | dictionary | string | | | `DictionaryCompoundWordTokenFilterFactory `__ | +| +-------------------+--------------+----------------+-------+ | +| | minWordSize | integer | | No | | +| +-------------------+--------------+----------------+-------+ | +| | minSubwordSize | integer | | No | | +| +-------------------+--------------+----------------+-------+ | +| | maxSubwordSize | integer | | No | | +| +-------------------+--------------+----------------+-------+ | +| | onlyLongestMatch | boolean | false | No | | ++-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| italianlightstem | | | | | `ItalianLightStemFilterFactory `__ | ++-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| patterncapturegroup | pattern | string | | No | `PatternCaptureGroupFilterFactory `__ | +| +-------------------+--------------+----------------+-------+ | +| | 
preserve_original | boolean | false | No | | ++-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| reversestring | | | | | `ReverseStringFilterFactory `__ | ++-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ Mappers From f80c83feacf34e4627b3e819f5fa6ebf59f8e916 Mon Sep 17 00:00:00 2001 From: jpgilaberte Date: Tue, 13 Jun 2017 15:55:20 +0200 Subject: [PATCH 28/40] Fix rst format --- doc/documentation.rst | 623 +++++++++++++++++++++--------------------- 1 file changed, 311 insertions(+), 312 deletions(-) diff --git a/doc/documentation.rst b/doc/documentation.rst index f1118a55a..741ec3478 100644 --- a/doc/documentation.rst +++ b/doc/documentation.rst @@ -952,17 +952,17 @@ Under the hood it uses the Lucene's factory classes TokenizerFactory, TokenFilte CharFilter ~~~~~~~~~~ -+-----------------------------+--------------+--------------+----------------+------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| Name | Option | Value type | Default value | Mandatory | CharFilter class name | ++-----------------------------+-------------------+--------------+----------------+------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| Name | Option | Value type | Default value | Mandatory | CharFilter class name | -+=============================+==============+==============+================+============+==========================================================================================================================================================================================+ ++=============================+===================+==============+================+============+==========================================================================================================================================================================================+ -| pattern | pattern | string | | No | `PatternReplaceCharFilterFactory `__ | -+-----------------------------+--------------+--------------+----------------+------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| htmlstrip | escapedtags | string[] | | No | `HTMLStripCharFilterFactory `__ | -+-----------------------------+--------------+--------------+----------------+------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| mapping | mapping | string | | No | `MappingCharFilterFactory `__ | -+-----------------------------+--------------+--------------+----------------+------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| persian | | | | No | `PersianCharFilterFactory `__ |
-+-----------------------------+--------------+--------------+----------------+------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| pattern | pattern | string | | No | `PatternReplaceCharFilterFactory `__ | ++-----------------------------+-------------------+--------------+----------------+------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| htmlstrip | escapedtags | string[] | | No | `HTMLStripCharFilterFactory `__ | ++-----------------------------+-------------------+--------------+----------------+------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| mapping | mapping | string | | No | `MappingCharFilterFactory `__ | ++-----------------------------+-------------------+--------------+----------------+------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| persian | | | | No | `PersianCharFilterFactory `__ | ++-----------------------------+-------------------+--------------+----------------+------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ Tokenizer ~~~~~~~~~ @@ -1010,308 +1010,307 @@ Tokenizer TokenFilter ~~~~~~~~~~~ -+-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| Name | Option | Value type | Default value | Mand. 
| TokenFilter class name | -+=============================+===================+==============+================+=======+========================================================================================================================================================================================================+ -| standard | | | | | `StandardFilter `__ | -+-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| apostrophe | | | | | `ApostropheFilter `__ | -+-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| arabicnormalization | | | | | `ArabicNormalizationFilter `__ | -+-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| arabicstem | | | | | `ArabicStemFilter `__ | -+-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| indicnormalization | | | | | `IndicNormalizationFilter `__ | -+-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| portuguesestem | | | | | `PortugueseStemFilter `__ | -+-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| germanminimalstem | | | | | `GermanMinimalStemFilter `__ | -+-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| uppercase | | | | | `UpperCaseFilter `__ | -+-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| keywordrepeat | | | | | `KeywordRepeatFilter `__ | -+-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| classic | | | | | `ClassicFilter `__ | 
-+-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| shingle | min_shingle_size | integer | 2 | No | `ShingleFilter `__ | -| +-------------------+--------------+----------------+-------+ | -| | max_shingle_size | integer | 2 | No | | -| +-------------------+--------------+----------------+-------+ | -| | outputUnigrams | boolean | false | No | | -| +-------------------+--------------+----------------+-------+ | -| | OUIfNoShingles | boolean | false | No | | -| +-------------------+--------------+----------------+-------+ | -| | tokenSeparator | string | | No | | -| +-------------------+--------------+----------------+-------+ | -| | fillerToken | string | | No | | -+-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| stemmeroverride | dictionary | string | | No | `StemmerOverrideFilter `__ | -| +-------------------+--------------+----------------+-------+ | -| | ignore_case | boolean | false | No | | -+-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| bulgarianstem | | | | | `BulgarianStemFilter `__ | -+-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| swedishlightstem | | | | | `SwedishLightStemFilter `__ | -+-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| frenchlightstem | | | | | `FrenchLightStemFilter `__ | -+-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| cjkwidth | | | | | `CJKWidthFilter `__ | -+-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| greekstem | | | | | `GreekStemFilter `__ | -+-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| stop | words | string | | No | `StopFilter `__ | -| +-------------------+--------------+----------------+-------+ | -| | 
format | string | | No | | -| +-------------------+--------------+----------------+-------+ | -| | ignore_case | boolean | false | No | | -+-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| hindistem | | | | | `HindiStemFilter `__ | -+-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| fingerprint | maxOutputTokenSize| integer | 1024 | No | `FingerprintFilterFactory `__ | -| +-------------------+--------------+----------------+-------+ | -| | separator | char | " " | No | | -+-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| spanishlightstem | | | | | `SpanishLightStemFilterFactory `__ | -+-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| hungarianlightstem | | | | | `HungarianLightStemFilterFactory `__ | -+-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| norwegianminimalstem | | | | | `NorwegianMinimalStemFilterFactory `__ | -+-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| persiannormalization | | | | | `PersianNormalizationFilterFactory `__ | -+-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| germanlightstem | | | | | `GermanLightStemFilterFactory `__ | -+-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| type | types | string | | | `TypeTokenFilterFactory `__ | -| +-------------------+--------------+----------------+-------+ | -| | useWhitelist | boolean | false | No | | -+-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| 
asciifolding | preserveOriginal | boolean | false | No | `ASCIIFoldingFilterFactory `__ | -+-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| lowercase | | | | | `LowerCaseFilterFactory `__ | -+-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| germanstem | | | | | `GermanStemFilterFactory `__ | -+-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| ngram | minGramSize | integer | 1 | No | `NGramFilterFactory `__ | -| +-------------------+--------------+----------------+-------+ | -| | maxGramSize | integer | 2 | No | | -+-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| limittokenposition | maxTokenPosition | integer | | Yes | `LimitTokenPositionFilterFactory `__ | -| +-------------------+--------------+----------------+-------+ | -| | consumeAllTokens | boolean | false | No | | -+-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| greeklowercase | | | | | `GreekLowerCaseFilterFactory `__ | -+-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| limittokenoffset | maxStartOffset | integer | | Yes | `LimitTokenOffsetFilterFactory `__ | -| +-------------------+--------------+----------------+-------+ | -| | consumeAllTokens | boolean | false | No | | -+-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| snowballporter | protected | string | | No | `SnowballPorterFilterFactory `__ | -| +-------------------+--------------+----------------+-------+ | -| | language | string | | No | | -+-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| typeaspayload | | | | | `TypeAsPayloadTokenFilterFactory `__ | 
-+-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| patternreplace | pattern | string | | No | `PatternReplaceFilterFactory `__ | -| +-------------------+--------------+----------------+-------+ | -| | replacement | string | | No | | -+-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| cjkbigram | han | boolean | | | `CJKBigramFilterFactory `__ | -| +-------------------+--------------+----------------+-------+ | -| | hiragana | boolean | | No | | -| +-------------------+--------------+----------------+-------+ | -| | katakana | boolean | | No | | -| +-------------------+--------------+----------------+-------+ | -| | hangul | boolean | | No | | -| +-------------------+--------------+----------------+-------+ | -| | outputUnigrams | boolean | | No | | -+-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| keywordmarker | protected | string | | | `KeywordMarkerFilterFactory `__ | -| +-------------------+--------------+----------------+-------+ | -| | pattern | string | | No | | -| +-------------------+--------------+----------------+-------+ | -| | ignoreCase | boolean | false | No | | -+-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| soranistem | | | | | `SoraniStemFilterFactory `__ | -+-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| elision | articles | string | | No | `ElisionFilterFactory `__ | -| +-------------------+--------------+----------------+-------+ | -| | ignoreCase | boolean | false | No | | -+-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| hunspellstem | dictionary | string | | No | `HunspellStemFilterFactory `__ | -| +-------------------+--------------+----------------+-------+ | -| | affix | string | | No | | -| +-------------------+--------------+----------------+-------+ | -| | longestOnly | boolean | false | No | | -+-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| codepointcount | min 
| integer | 0 | No | `CodepointCountFilterFactory `__ | -| +-------------------+--------------+----------------+-------+ | -| | max | integer | 1 | No | | -+-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| czechstem | | | | | `CzechStemFilterFactory `__ | -+-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| turkishlowercase | | | | | `TurkishLowerCaseFilterFactory `__ | -+-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| daterecognizer | datePattern | string | | No | `DateRecognizerFilterFactory `__ | -| +-------------------+--------------+----------------+-------+ | -| | locale | string | | No | | -+-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| portugueselightstem | | | | | `PortugueseLightStemFilterFactory `__ | -+-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| irishlowercase | | | | | `IrishLowerCaseFilterFactory `__ | -+-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| commongramsquery | words | string | | No | `CommonGramsQueryFilterFactory `__ | -| +-------------------+--------------+----------------+-------+ | -| | ignoreCase | boolean | false | No | | -+-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| numericpayload | payload | integer | | No | `NumericPayloadTokenFilterFactory `__ | -| +-------------------+--------------+----------------+-------+ | -| | typeMatch | string | | No | | -+-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| scandinavianfolding | | | | | `ScandinavianFoldingFilterFactory `__ | 
-+-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| germannormalization | | | | | `GermanNormalizationFilterFactory `__ | -+-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| delimitedpayload | encoder | string | | No | `GreekLowerCaseFilterFactory `__ | -| +-------------------+--------------+----------------+-------+ | -| | delimiter | string | | No | | -+-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| worddelimiter | protected | string | | No | `WordDelimiterFilterFactory `__ | -| +-------------------+--------------+----------------+-------+ | -| | preserveOriginal | integer | 0 | No | | -| +-------------------+--------------+----------------+-------+ | -| | splitOnNumerics | integer | 1 | No | | -| +-------------------+--------------+----------------+-------+ | -| | splitOnCaseChange | integer | 1 | No | | -| +-------------------+--------------+----------------+-------+ | -| | catenateWords | integer | 0 | No | | -| +-------------------+--------------+----------------+-------+ | -| | catenateNumbers | integer | 0 | No | | -| +-------------------+--------------+----------------+-------+ | -| | catenateAll | integer | 0 | No | | -| +-------------------+--------------+----------------+-------+ | -| | generateWordParts | integer | 1 | No | | -| +-------------------+--------------+----------------+-------+ | -| | genNumberParts | integer | 1 | No | | -| +-------------------+--------------+----------------+-------+ | -| | stemEnglishPosse | integer | 1 | No | | -| +-------------------+--------------+----------------+-------+ | -| | types | string | | No | | -+-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| portugueseminimalstem | | | | | `PortugueseMinimalStemFilterFactory `__ | -+-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| removeduplicates | | | | | `RemoveDuplicatesTokenFilterFactory `__ | -+-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| edgengram | minGramSize | integer | 1 | No | `EdgeNGramFilterFactory `__ | -| +-------------------+--------------+----------------+-------+ | -| | maxGramSize | integer | 2 | No | | 
-+-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| latvianstem | | | | | `LatvianStemFilterFactory `__ | -+-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| finnishlightstem | | | | | `FinnishLightStemFilterFactory `__ | -+-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| commongrams | words | string | | No | `CommonGramsFilterFactory `__ | -| +-------------------+--------------+----------------+-------+ | -| | ignoreCase | boolean | false | No | | -+-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| galicianstem | | | | | `GalicianStemFilterFactory `__ | -+-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| kstem | | | | | `KStemFilterFactory `__ | -+-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| norwegianlightstem | variant | string | | No | `NorwegianLightStemFilterFactory `__ | -+-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| trim | | | | | `TrimFilterFactory `__ | -+-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| length | min | integer | 0 | No | `LengthFilterFactory `__ | -| +-------------------+--------------+----------------+-------+ | -| | max | integer | 1 | No | | -+-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| decimaldigit | | | | | `DecimalDigitFilterFactory `__ | 
-+-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| brazilianstem | | | | | `BrazilianStemFilterFactory `__ | -+-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| capitalization | onlyFirstWord | boolean | false | No | `CapitalizationFilterFactory `__ | -| +-------------------+--------------+----------------+-------+ | -| | keep | string | | No | | -| +-------------------+--------------+----------------+-------+ | -| | keepIgnoreCase | boolean | false | No | | -| +-------------------+--------------+----------------+-------+ | -| | okPrefix | string | | No | | -+-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| serbiannormalization | | | | | `SerbianNormalizationFilterFactory `__ | -+-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| frenchminimalstem | | | | | `FrenchMinimalStemFilterFactory `__ | -+-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| englishminimalstem | | | | | `EnglishMinimalStemFilterFactory `__ | -+-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| limittokencount | maxTokenCount | integer | | No | `LimitTokenCountFilterFactory `__ | -| +-------------------+--------------+----------------+-------+ | -| | consumeAllTokens | boolean | false | No | | -+-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| hyphenatedwords | | | | | `HyphenatedWordsFilterFactory `__ | -+-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| truncate | prefixLength | integer | | No | `TruncateTokenFilterFactory `__ | 
-+-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| tokenoffsetpayload | | | | | `TokenOffsetPayloadTokenFilterFactory `__ | -+-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| galicianminimalstem | | | | | `GalicianMinimalStemFilterFactory `__ | -+-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| russianlightstem | | | | | `RussianLightStemFilterFactory `__ | -+-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| hindinormalization | | | | | `HindiNormalizationFilterFactory `__ | -+-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| scandinaviannormalization | | | | | `ScandinavianNormalizationFilterFactory `__ | -+-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| thaiword | | | | | `ThaiWordFilterFactory `__ | -+-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| synonym | synonyms | string | | No | `SynonymFilterFactory `__ | -| +-------------------+--------------+----------------+-------+ | -| | format | string | | No | | -| +-------------------+--------------+----------------+-------+ | -| | ignoreCase | boolean | false | No | | -| +-------------------+--------------+----------------+-------+ | -| | expand | boolean | false | No | | -| +-------------------+--------------+----------------+-------+ | -| | tokenizerFactory | string | false | No | | -+-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| indonesianstem | stemDerivational | boolean | false | No | `IndonesianStemFilterFactory `__ | 
-+-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| keepword | words | string | | No | `KeepWordFilterFactory `__ | -| +-------------------+--------------+----------------+-------+ | -| | ignoreCase | boolean | false | No | | -+-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| hyphenationcompoundword | hyphenator | string | | No | `HyphenationCompoundWordTokenFilterFactory `__ | -| +-------------------+--------------+----------------+-------+ | -| | encoding | string | | No | | -| +-------------------+--------------+----------------+-------+ | -| | dictionary | string | | No | | -| +-------------------+--------------+----------------+-------+ | -| | minWordSize | integer | | No | | -| +-------------------+--------------+----------------+-------+ | -| | minSubwordSize | integer | | No | | -| +-------------------+--------------+----------------+-------+ | -| | maxSubwordSize | integer | | No | | -| +-------------------+--------------+----------------+-------+ | -| | onlyLongestMatch | boolean | false | No | | -+-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| dictionarycompoundword | dictionary | string | | | `DictionaryCompoundWordTokenFilterFactory `__ | -| +-------------------+--------------+----------------+-------+ | -| | minWordSize | integer | | No | | -| +-------------------+--------------+----------------+-------+ | -| | minSubwordSize | integer | | No | | -| +-------------------+--------------+----------------+-------+ | -| | maxSubwordSize | integer | | No | | -| +-------------------+--------------+----------------+-------+ | -| | onlyLongestMatch | boolean | false | No | | -+-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| italianlightstem | | | | | `ItalianLightStemFilterFactory `__ | -+-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| patterncapturegroup | pattern | string | | No | `PatternCaptureGroupFilterFactory `__ | -| +-------------------+--------------+----------------+-------+ | -| | preserve_original | boolean | false | No | | -+-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| reversestring | | | | | 
`ReverseStringFilterFactory `__ | -+-----------------------------+-------------------+--------------+----------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ - ++-----------------------------+-------------------+--------------+----------------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| Name | Option | Value type | Default value | Mand. | TokenFilter class name | ++=============================+===================+==============+================+=======+===============================================================================================================================================================================================+ +| standard | | | | | `StandardFilter `__ | ++-----------------------------+-------------------+--------------+----------------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| apostrophe | | | | | `ApostropheFilter `__ | ++-----------------------------+-------------------+--------------+----------------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| arabicnormalization | | | | | `ArabicNormalizationFilter `__ | ++-----------------------------+-------------------+--------------+----------------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| arabicstem | | | | | `ArabicStemFilter `__ | ++-----------------------------+-------------------+--------------+----------------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| indicnormalization | | | | | `IndicNormalizationFilter `__ | ++-----------------------------+-------------------+--------------+----------------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| portuguesestem | | | | | `PortugueseStemFilter `__ | ++-----------------------------+-------------------+--------------+----------------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| germanminimalstem | | | | | `GermanMinimalStemFilter `__ | ++-----------------------------+-------------------+--------------+----------------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| uppercase | | | | | `UpperCaseFilter `__ | 
++-----------------------------+-------------------+--------------+----------------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| keywordrepeat | | | | | `KeywordRepeatFilter `__ | ++-----------------------------+-------------------+--------------+----------------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| classic | | | | | `ClassicFilter `__ | ++-----------------------------+-------------------+--------------+----------------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| shingle | min_shingle_size | integer | 2 | No | `ShingleFilter `__ | +| +-------------------+--------------+----------------+-------+ | +| | max_shingle_size | integer | 2 | No | | +| +-------------------+--------------+----------------+-------+ | +| | outputUnigrams | boolean | false | No | | +| +-------------------+--------------+----------------+-------+ | +| | OUIfNoShingles | boolean | false | No | | +| +-------------------+--------------+----------------+-------+ | +| | tokenSeparator | string | | No | | +| +-------------------+--------------+----------------+-------+ | +| | fillerToken | string | | No | | ++-----------------------------+-------------------+--------------+----------------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| stemmeroverride | dictionary | string | | No | `StemmerOverrideFilter `__ | +| +-------------------+--------------+----------------+-------+ | +| | ignore_case | boolean | false | No | | ++-----------------------------+-------------------+--------------+----------------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| bulgarianstem | | | | | `BulgarianStemFilter `__ | ++-----------------------------+-------------------+--------------+----------------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| swedishlightstem | | | | | `SwedishLightStemFilter `__ | ++-----------------------------+-------------------+--------------+----------------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| frenchlightstem | | | | | `FrenchLightStemFilter `__ | ++-----------------------------+-------------------+--------------+----------------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| cjkwidth | | | | | `CJKWidthFilter `__ | 
++-----------------------------+-------------------+--------------+----------------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| greekstem | | | | | `GreekStemFilter `__ | ++-----------------------------+-------------------+--------------+----------------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| stop | words | string | | No | `StopFilter `__ | +| +-------------------+--------------+----------------+-------+ | +| | format | string | | No | | +| +-------------------+--------------+----------------+-------+ | +| | ignore_case | boolean | false | No | | ++-----------------------------+-------------------+--------------+----------------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| hindistem | | | | | `HindiStemFilter `__ | ++-----------------------------+-------------------+--------------+----------------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| fingerprint | maxOutputTokenSize| integer | 1024 | No | `FingerprintFilterFactory `__ | +| +-------------------+--------------+----------------+-------+ | +| | separator | char | " " | No | | ++-----------------------------+-------------------+--------------+----------------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| spanishlightstem | | | | | `SpanishLightStemFilterFactory `__ | ++-----------------------------+-------------------+--------------+----------------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| hungarianlightstem | | | | | `HungarianLightStemFilterFactory `__ | ++-----------------------------+-------------------+--------------+----------------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| norwegianminimalstem | | | | | `NorwegianMinimalStemFilterFactory `__ | ++-----------------------------+-------------------+--------------+----------------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| persiannormalization | | | | | `PersianNormalizationFilterFactory `__ | ++-----------------------------+-------------------+--------------+----------------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| germanlightstem | | | | | `GermanLightStemFilterFactory `__ | 
++-----------------------------+-------------------+--------------+----------------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| type | types | string | | | `TypeTokenFilterFactory `__ | +| +-------------------+--------------+----------------+-------+ | +| | useWhitelist | boolean | false | No | | ++-----------------------------+-------------------+--------------+----------------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| asciifolding | preserveOriginal | boolean | false | No | `ASCIIFoldingFilterFactory `__ | ++-----------------------------+-------------------+--------------+----------------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| lowercase | | | | | `LowerCaseFilterFactory `__ | ++-----------------------------+-------------------+--------------+----------------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| germanstem | | | | | `GermanStemFilterFactory `__ | ++-----------------------------+-------------------+--------------+----------------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| ngram | minGramSize | integer | 1 | No | `NGramFilterFactory `__ | +| +-------------------+--------------+----------------+-------+ | +| | maxGramSize | integer | 2 | No | | ++-----------------------------+-------------------+--------------+----------------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| limittokenposition | maxTokenPosition | integer | | Yes | `LimitTokenPositionFilterFactory `__ | +| +-------------------+--------------+----------------+-------+ | +| | consumeAllTokens | boolean | false | No | | ++-----------------------------+-------------------+--------------+----------------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| greeklowercase | | | | | `GreekLowerCaseFilterFactory `__ | ++-----------------------------+-------------------+--------------+----------------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| limittokenoffset | maxStartOffset | integer | | Yes | `LimitTokenOffsetFilterFactory `__ | +| +-------------------+--------------+----------------+-------+ | +| | consumeAllTokens | boolean | false | No | | 
++-----------------------------+-------------------+--------------+----------------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| snowballporter | protected | string | | No | `SnowballPorterFilterFactory `__ | +| +-------------------+--------------+----------------+-------+ | +| | language | string | | No | | ++-----------------------------+-------------------+--------------+----------------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| typeaspayload | | | | | `TypeAsPayloadTokenFilterFactory `__ | ++-----------------------------+-------------------+--------------+----------------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| patternreplace | pattern | string | | No | `PatternReplaceFilterFactory `__ | +| +-------------------+--------------+----------------+-------+ | +| | replacement | string | | No | | ++-----------------------------+-------------------+--------------+----------------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| cjkbigram | han | boolean | | | `CJKBigramFilterFactory `__ | +| +-------------------+--------------+----------------+-------+ | +| | hiragana | boolean | | No | | +| +-------------------+--------------+----------------+-------+ | +| | katakana | boolean | | No | | +| +-------------------+--------------+----------------+-------+ | +| | hangul | boolean | | No | | +| +-------------------+--------------+----------------+-------+ | +| | outputUnigrams | boolean | | No | | ++-----------------------------+-------------------+--------------+----------------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| keywordmarker | protected | string | | | `KeywordMarkerFilterFactory `__ | +| +-------------------+--------------+----------------+-------+ | +| | pattern | string | | No | | +| +-------------------+--------------+----------------+-------+ | +| | ignoreCase | boolean | false | No | | ++-----------------------------+-------------------+--------------+----------------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| soranistem | | | | | `SoraniStemFilterFactory `__ | ++-----------------------------+-------------------+--------------+----------------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| elision | articles | string | | No | `ElisionFilterFactory `__ | +| +-------------------+--------------+----------------+-------+ | +| | ignoreCase | boolean | false | No | | 
++-----------------------------+-------------------+--------------+----------------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| hunspellstem | dictionary | string | | No | `HunspellStemFilterFactory `__ | +| +-------------------+--------------+----------------+-------+ | +| | affix | string | | No | | +| +-------------------+--------------+----------------+-------+ | +| | longestOnly | boolean | false | No | | ++-----------------------------+-------------------+--------------+----------------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| codepointcount | min | integer | 0 | No | `CodepointCountFilterFactory `__ | +| +-------------------+--------------+----------------+-------+ | +| | max | integer | 1 | No | | ++-----------------------------+-------------------+--------------+----------------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| czechstem | | | | | `CzechStemFilterFactory `__ | ++-----------------------------+-------------------+--------------+----------------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| turkishlowercase | | | | | `TurkishLowerCaseFilterFactory `__ | ++-----------------------------+-------------------+--------------+----------------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| daterecognizer | datePattern | string | | No | `DateRecognizerFilterFactory `__ | +| +-------------------+--------------+----------------+-------+ | +| | locale | string | | No | | ++-----------------------------+-------------------+--------------+----------------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| portugueselightstem | | | | | `PortugueseLightStemFilterFactory `__ | ++-----------------------------+-------------------+--------------+----------------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| irishlowercase | | | | | `IrishLowerCaseFilterFactory `__ | ++-----------------------------+-------------------+--------------+----------------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| commongramsquery | words | string | | No | `CommonGramsQueryFilterFactory `__ | +| +-------------------+--------------+----------------+-------+ | +| | ignoreCase | boolean | false | No | | 
++-----------------------------+-------------------+--------------+----------------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+| numericpayload | payload | integer | | No | `NumericPayloadTokenFilterFactory `__ |
+| +-------------------+--------------+----------------+-------+ |
+| | typeMatch | string | | No | |
++-----------------------------+-------------------+--------------+----------------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+| scandinavianfolding | | | | | `ScandinavianFoldingFilterFactory `__ |
++-----------------------------+-------------------+--------------+----------------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+| germannormalization | | | | | `GermanNormalizationFilterFactory `__ |
++-----------------------------+-------------------+--------------+----------------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+| delimitedpayload | encoder | string | | No | `DelimitedPayloadTokenFilterFactory `__ |
+| +-------------------+--------------+----------------+-------+ |
+| | delimiter | string | | No | |
++-----------------------------+-------------------+--------------+----------------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+| worddelimiter | protected | string | | No | `WordDelimiterFilterFactory `__ |
+| +-------------------+--------------+----------------+-------+ |
+| | preserveOriginal | integer | 0 | No | |
+| +-------------------+--------------+----------------+-------+ |
+| | splitOnNumerics | integer | 1 | No | |
+| +-------------------+--------------+----------------+-------+ |
+| | splitOnCaseChange | integer | 1 | No | |
+| +-------------------+--------------+----------------+-------+ |
+| | catenateWords | integer | 0 | No | |
+| +-------------------+--------------+----------------+-------+ |
+| | catenateNumbers | integer | 0 | No | |
+| +-------------------+--------------+----------------+-------+ |
+| | catenateAll | integer | 0 | No | |
+| +-------------------+--------------+----------------+-------+ |
+| | generateWordParts | integer | 1 | No | |
+| +-------------------+--------------+----------------+-------+ |
+| | genNumberParts | integer | 1 | No | |
+| +-------------------+--------------+----------------+-------+ |
+| | stemEnglishPosse | integer | 1 | No | |
+| +-------------------+--------------+----------------+-------+ |
+| | types | string | | No | |
++-----------------------------+-------------------+--------------+----------------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+| portugueseminimalstem | | | | | `PortugueseMinimalStemFilterFactory `__ |
++-----------------------------+-------------------+--------------+----------------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| removeduplicates | | | | | `RemoveDuplicatesTokenFilterFactory `__ | ++-----------------------------+-------------------+--------------+----------------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| edgengram | minGramSize | integer | 1 | No | `EdgeNGramFilterFactory `__ | +| +-------------------+--------------+----------------+-------+ | +| | maxGramSize | integer | 2 | No | | ++-----------------------------+-------------------+--------------+----------------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| latvianstem | | | | | `LatvianStemFilterFactory `__ | ++-----------------------------+-------------------+--------------+----------------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| finnishlightstem | | | | | `FinnishLightStemFilterFactory `__ | ++-----------------------------+-------------------+--------------+----------------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| commongrams | words | string | | No | `CommonGramsFilterFactory `__ | +| +-------------------+--------------+----------------+-------+ | +| | ignoreCase | boolean | false | No | | ++-----------------------------+-------------------+--------------+----------------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| galicianstem | | | | | `GalicianStemFilterFactory `__ | ++-----------------------------+-------------------+--------------+----------------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| kstem | | | | | `KStemFilterFactory `__ | ++-----------------------------+-------------------+--------------+----------------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| norwegianlightstem | variant | string | | No | `NorwegianLightStemFilterFactory `__ | ++-----------------------------+-------------------+--------------+----------------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| trim | | | | | `TrimFilterFactory `__ | 
++-----------------------------+-------------------+--------------+----------------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| length | min | integer | 0 | No | `LengthFilterFactory `__ | +| +-------------------+--------------+----------------+-------+ | +| | max | integer | 1 | No | | ++-----------------------------+-------------------+--------------+----------------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| decimaldigit | | | | | `DecimalDigitFilterFactory `__ | ++-----------------------------+-------------------+--------------+----------------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| brazilianstem | | | | | `BrazilianStemFilterFactory `__ | ++-----------------------------+-------------------+--------------+----------------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| capitalization | onlyFirstWord | boolean | false | No | `CapitalizationFilterFactory `__ | +| +-------------------+--------------+----------------+-------+ | +| | keep | string | | No | | +| +-------------------+--------------+----------------+-------+ | +| | keepIgnoreCase | boolean | false | No | | +| +-------------------+--------------+----------------+-------+ | +| | okPrefix | string | | No | | ++-----------------------------+-------------------+--------------+----------------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| serbiannormalization | | | | | `SerbianNormalizationFilterFactory `__ | ++-----------------------------+-------------------+--------------+----------------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| frenchminimalstem | | | | | `FrenchMinimalStemFilterFactory `__ | ++-----------------------------+-------------------+--------------+----------------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| englishminimalstem | | | | | `EnglishMinimalStemFilterFactory `__ | ++-----------------------------+-------------------+--------------+----------------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| limittokencount | maxTokenCount | integer | | No | `LimitTokenCountFilterFactory `__ | +| +-------------------+--------------+----------------+-------+ | +| | consumeAllTokens | boolean | false | No | | 
++-----------------------------+-------------------+--------------+----------------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| hyphenatedwords | | | | | `HyphenatedWordsFilterFactory `__ | ++-----------------------------+-------------------+--------------+----------------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| truncate | prefixLength | integer | | No | `TruncateTokenFilterFactory `__ | ++-----------------------------+-------------------+--------------+----------------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| tokenoffsetpayload | | | | | `TokenOffsetPayloadTokenFilterFactory `__ | ++-----------------------------+-------------------+--------------+----------------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| galicianminimalstem | | | | | `GalicianMinimalStemFilterFactory `__ | ++-----------------------------+-------------------+--------------+----------------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| russianlightstem | | | | | `RussianLightStemFilterFactory `__ | ++-----------------------------+-------------------+--------------+----------------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| hindinormalization | | | | | `HindiNormalizationFilterFactory `__ | ++-----------------------------+-------------------+--------------+----------------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| scandinaviannormalization | | | | | `ScandinavianNormalizationFilterFactory `__ | ++-----------------------------+-------------------+--------------+----------------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| thaiword | | | | | `ThaiWordFilterFactory `__ | ++-----------------------------+-------------------+--------------+----------------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| synonym | synonyms | string | | No | `SynonymFilterFactory `__ | +| +-------------------+--------------+----------------+-------+ | +| | format | string | | No | | +| +-------------------+--------------+----------------+-------+ | +| | ignoreCase | boolean | false | No | | +| +-------------------+--------------+----------------+-------+ | +| | expand | boolean | false | No | | +| 
+-------------------+--------------+----------------+-------+ |
+| | tokenizerFactory | string | | No | |
++-----------------------------+-------------------+--------------+----------------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+| indonesianstem | stemDerivational | boolean | false | No | `IndonesianStemFilterFactory `__ |
++-----------------------------+-------------------+--------------+----------------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+| keepword | words | string | | No | `KeepWordFilterFactory `__ |
+| +-------------------+--------------+----------------+-------+ |
+| | ignoreCase | boolean | false | No | |
++-----------------------------+-------------------+--------------+----------------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+| hyphenationcompoundword | hyphenator | string | | No | `HyphenationCompoundWordTokenFilterFactory `__ |
+| +-------------------+--------------+----------------+-------+ |
+| | encoding | string | | No | |
+| +-------------------+--------------+----------------+-------+ |
+| | dictionary | string | | No | |
+| +-------------------+--------------+----------------+-------+ |
+| | minWordSize | integer | | No | |
+| +-------------------+--------------+----------------+-------+ |
+| | minSubwordSize | integer | | No | |
+| +-------------------+--------------+----------------+-------+ |
+| | maxSubwordSize | integer | | No | |
+| +-------------------+--------------+----------------+-------+ |
+| | onlyLongestMatch | boolean | false | No | |
++-----------------------------+-------------------+--------------+----------------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+| dictionarycompoundword | dictionary | string | | | `DictionaryCompoundWordTokenFilterFactory `__ |
+| +-------------------+--------------+----------------+-------+ |
+| | minWordSize | integer | | No | |
+| +-------------------+--------------+----------------+-------+ |
+| | minSubwordSize | integer | | No | |
+| +-------------------+--------------+----------------+-------+ |
+| | maxSubwordSize | integer | | No | |
+| +-------------------+--------------+----------------+-------+ |
+| | onlyLongestMatch | boolean | false | No | |
++-----------------------------+-------------------+--------------+----------------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+| italianlightstem | | | | | `ItalianLightStemFilterFactory `__ |
++-----------------------------+-------------------+--------------+----------------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+| patterncapturegroup | pattern | string | | No | `PatternCaptureGroupFilterFactory `__ |
+| 
+-------------------+--------------+----------------+-------+ | +| | preserve_original | boolean | false | No | | ++-----------------------------+-------------------+--------------+----------------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| reversestring | | | | | `ReverseStringFilterFactory `__ | ++-----------------------------+-------------------+--------------+----------------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ Mappers ======= From 1c55d099a7643a68cace102f71153fe195071eb7 Mon Sep 17 00:00:00 2001 From: jpgilaberte Date: Tue, 13 Jun 2017 16:00:13 +0200 Subject: [PATCH 29/40] Fix rst format --- doc/documentation.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/documentation.rst b/doc/documentation.rst index 741ec3478..87676bbe0 100644 --- a/doc/documentation.rst +++ b/doc/documentation.rst @@ -954,7 +954,7 @@ CharFilter +-----------------------------+-------------------+--------------+----------------+------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | Name | Option | Value type | Default value | Mandatory | CharFilter class name | -+=============================+==============+==============+================+============+==========================================================================================================================================================================================+ ++=============================+===================+==============+================+============+==========================================================================================================================================================================================+ | pattern | pattern | string | | No | `PatternReplaceCharFilterFactory `__ | +-----------------------------+-------------------+--------------+----------------+------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | htmlstrip | escapedtags | string[] | | No | `HTMLStripCharFilterFactory `__ | From 29d2fa4ac36b9b99f4a0fab5609581457cedae77 Mon Sep 17 00:00:00 2001 From: jpgilaberte Date: Tue, 13 Jun 2017 16:03:33 +0200 Subject: [PATCH 30/40] Fix persian charfilter --- doc/documentation.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/documentation.rst b/doc/documentation.rst index 87676bbe0..88fca186a 100644 --- a/doc/documentation.rst +++ b/doc/documentation.rst @@ -961,7 +961,7 @@ CharFilter +-----------------------------+-------------------+--------------+----------------+------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | mapping | mapping | string | | No | `MappingCharFilterFactory `__ | 
+-----------------------------+-------------------+--------------+----------------+------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| persian | | | | No | `PersianCharFilterFactory `__ | +| persian | | | | | `PersianCharFilterFactory `__ | +-----------------------------+-------------------+--------------+----------------+------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ Tokenizer From cf88e27669659b8999b80fc350f96003dd605b6b Mon Sep 17 00:00:00 2001 From: jpgilaberte Date: Wed, 14 Jun 2017 13:11:31 +0200 Subject: [PATCH 31/40] Fix documentation --- doc/documentation.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/documentation.rst b/doc/documentation.rst index 88fca186a..48ef7ff76 100644 --- a/doc/documentation.rst +++ b/doc/documentation.rst @@ -1217,7 +1217,7 @@ TokenFilter +-----------------------------+-------------------+--------------+----------------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | kstem | | | | | `KStemFilterFactory `__ | +-----------------------------+-------------------+--------------+----------------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| norwegianlightstem | variant | string | | No | `NorwegianLightStemFilterFactory `__ | +| norwegianlightstem | variant | string | nb | No | `NorwegianLightStemFilterFactory `__ | +-----------------------------+-------------------+--------------+----------------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | trim | | | | | `TrimFilterFactory `__ | +-----------------------------+-------------------+--------------+----------------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ From a2f70855a09aab839ab151edc9ca9fa6fb787b9f Mon Sep 17 00:00:00 2001 From: jpgilaberte Date: Wed, 14 Jun 2017 13:12:01 +0200 Subject: [PATCH 32/40] Add char filter test --- .../analysis/charFilter/CharFilterTest.java | 66 +++++++++++++++++++ 1 file changed, 66 insertions(+) create mode 100644 builder/src/test/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/charFilter/CharFilterTest.java diff --git a/builder/src/test/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/charFilter/CharFilterTest.java b/builder/src/test/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/charFilter/CharFilterTest.java new file mode 100644 index 000000000..1f83e45df --- /dev/null +++ b/builder/src/test/java/com/stratio/cassandra/lucene/builder/index/schema/analysis/charFilter/CharFilterTest.java @@ -0,0 +1,66 @@ +/* + * Copyright (C) 2014 Stratio (http://stratio.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the 
License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.stratio.cassandra.lucene.builder.index.schema.analysis.charFilter;
+
+import org.junit.Test;
+import java.util.ArrayList;
+import static org.junit.Assert.*;
+
+
+/**
+ * Created by jpgilaberte on 14/06/17.
+ */
+public class CharFilterTest {
+
+    @Test
+    public void testHtmlStripCharFilter() {
+        ArrayList<String> listTags = new ArrayList<>();
+        listTags.add("
"); + listTags.add("