Skip to content

Commit 4f61b42

Browse files
committedSep 1, 2015
Merge pull request apache#1578 from b-slim/fix_extraction_filter_2
Fix UT and documentation to the extraction filter
2 parents d89b0fa + 7549f02 commit 4f61b42

File tree

9 files changed

+490
-35
lines changed

9 files changed

+490
-35
lines changed
 

‎docs/content/querying/aggregations.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -185,7 +185,7 @@ A filtered aggregator wraps any given aggregator, but only aggregates the values
185185

186186
This makes it possible to compute the results of a filtered and an unfiltered aggregation simultaneously, without having to issue multiple queries, and use both results as part of post-aggregations.
187187

188-
*Limitations:* The filtered aggregator currently only supports 'or', 'and', 'selector' and 'not' filters, i.e. matching one or multiple dimensions against a single value.
188+
*Limitations:* The filtered aggregator currently only supports 'or', 'and', 'selector', 'not' and 'Extraction' filters, i.e. matching one or multiple dimensions against a single value.
189189

190190
*Note:* If only the filtered results are required, consider putting the filter on the query itself, which will be much faster since it does not require scanning all the data.
191191

‎docs/content/querying/filters.md

+30
Original file line numberDiff line numberDiff line change
@@ -80,3 +80,33 @@ The following matches any dimension values for the dimension `name` between `'ba
8080
"function" : "function(x) { return(x >= 'bar' && x <= 'foo') }"
8181
}
8282
```
83+
84+
### Extraction filter
85+
86+
Extraction filter matches a dimension using some specific [Extraction function](./dimensionspecs.html#extraction-functions).
87+
The following filter matches the values for which the extraction function has transformation entry `input_key=output_value` where
88+
`output_value` is equal to the filter `value` and `input_key` is present as dimension.
89+
90+
**Example**
91+
The following matches dimension values in `[product_1, product_3, product_5]` for the column `product`
92+
93+
```json
94+
{
95+
"filter": {
96+
"type": "extraction",
97+
"dimension": "product",
98+
"value": "bar_1",
99+
"extractionFn": {
100+
"type": "lookup",
101+
"lookup": {
102+
"type": "map",
103+
"map": {
104+
"product_1": "bar_1",
105+
"product_5": "bar_1",
106+
"product_3": "bar_1"
107+
}
108+
}
109+
}
110+
}
111+
}
112+
```

‎extensions/kafka-extraction-namespace/src/main/java/io/druid/server/namespace/KafkaExtractionNamespaceFactory.java

-3
Original file line numberDiff line numberDiff line change
@@ -24,13 +24,10 @@
2424
import com.google.inject.Inject;
2525
import io.druid.query.extraction.namespace.ExtractionNamespaceFunctionFactory;
2626
import io.druid.query.extraction.namespace.KafkaExtractionNamespace;
27-
import io.druid.query.extraction.namespace.URIExtractionNamespace;
28-
import io.druid.server.namespace.cache.NamespaceExtractionCacheManager;
2927

3028
import javax.annotation.Nullable;
3129
import java.util.Map;
3230
import java.util.concurrent.Callable;
33-
import java.util.concurrent.ConcurrentMap;
3431

3532
/**
3633
*

‎extensions/kafka-extraction-namespace/src/main/java/io/druid/server/namespace/KafkaExtractionNamespaceModule.java

-7
Original file line numberDiff line numberDiff line change
@@ -19,27 +19,20 @@
1919

2020
package io.druid.server.namespace;
2121

22-
import com.fasterxml.jackson.core.Version;
2322
import com.fasterxml.jackson.core.type.TypeReference;
2423
import com.fasterxml.jackson.databind.Module;
2524
import com.fasterxml.jackson.databind.ObjectMapper;
2625
import com.fasterxml.jackson.databind.module.SimpleModule;
2726
import com.google.common.base.Throwables;
2827
import com.google.common.collect.ImmutableList;
2928
import com.google.inject.Binder;
30-
import com.google.inject.Injector;
3129
import com.google.inject.Provides;
32-
import com.google.inject.TypeLiteral;
33-
import com.google.inject.multibindings.MapBinder;
3430
import com.google.inject.name.Named;
3531
import io.druid.guice.LazySingleton;
3632
import io.druid.guice.LifecycleModule;
3733
import io.druid.guice.annotations.Json;
3834
import io.druid.initialization.DruidModule;
39-
import io.druid.query.extraction.namespace.ExtractionNamespace;
40-
import io.druid.query.extraction.namespace.ExtractionNamespaceFunctionFactory;
4135
import io.druid.query.extraction.namespace.KafkaExtractionNamespace;
42-
import io.druid.server.namespace.cache.NamespaceExtractionCacheManager;
4336

4437
import java.io.IOException;
4538
import java.util.List;

‎processing/src/main/java/io/druid/query/filter/ExtractionDimFilter.java

-1
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,6 @@ public ExtractionDimFilter(
4343
)
4444
{
4545
Preconditions.checkArgument(dimension != null, "dimension must not be null");
46-
Preconditions.checkArgument(value != null, "value must not be null");
4746
Preconditions.checkArgument(extractionFn != null || dimExtractionFn != null, "extraction function must not be null");
4847

4948
this.dimension = dimension;

‎processing/src/main/java/io/druid/segment/column/SimpleDictionaryEncodedColumn.java

+3-1
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717

1818
package io.druid.segment.column;
1919

20+
import com.google.common.base.Strings;
2021
import com.metamx.common.guava.CloseQuietly;
2122
import io.druid.segment.data.CachingIndexed;
2223
import io.druid.segment.data.IndexedInts;
@@ -71,7 +72,8 @@ public IndexedInts getMultiValueRow(int rowNum)
7172
@Override
7273
public String lookupName(int id)
7374
{
74-
return cachedLookups.get(id);
75+
//Empty to Null will ensure that null and empty are equivalent for extraction function
76+
return Strings.emptyToNull(cachedLookups.get(id));
7577
}
7678

7779
@Override

‎processing/src/main/java/io/druid/segment/filter/ExtractionFilter.java

+79-17
Original file line numberDiff line numberDiff line change
@@ -17,18 +17,22 @@
1717

1818
package io.druid.segment.filter;
1919

20+
import com.google.common.base.Predicate;
21+
import com.google.common.base.Strings;
2022
import com.google.common.collect.Lists;
2123
import com.metamx.collections.bitmap.ImmutableBitmap;
22-
import com.metamx.collections.bitmap.WrappedImmutableConciseBitmap;
2324
import io.druid.query.extraction.ExtractionFn;
2425
import io.druid.query.filter.BitmapIndexSelector;
2526
import io.druid.query.filter.Filter;
2627
import io.druid.query.filter.ValueMatcher;
2728
import io.druid.query.filter.ValueMatcherFactory;
2829
import io.druid.segment.ColumnSelectorFactory;
30+
import io.druid.segment.DimensionSelector;
2931
import io.druid.segment.data.Indexed;
30-
import it.uniroma3.mat.extendedset.intset.ImmutableConciseSet;
32+
import io.druid.segment.data.IndexedInts;
3133

34+
import java.util.BitSet;
35+
import java.util.Iterator;
3236
import java.util.List;
3337

3438
/**
@@ -39,27 +43,50 @@ public class ExtractionFilter implements Filter
3943
private final String value;
4044
private final ExtractionFn fn;
4145

42-
public ExtractionFilter(
43-
String dimension,
44-
String value,
45-
ExtractionFn fn
46-
)
46+
public ExtractionFilter(String dimension, String value, ExtractionFn fn)
4747
{
4848
this.dimension = dimension;
49-
this.value = value;
49+
this.value = Strings.nullToEmpty(value);
5050
this.fn = fn;
5151
}
5252

5353
private List<Filter> makeFilters(BitmapIndexSelector selector)
5454
{
55-
final Indexed<String> allDimVals = selector.getDimensionValues(dimension);
55+
Indexed<String> allDimVals = selector.getDimensionValues(dimension);
5656
final List<Filter> filters = Lists.newArrayList();
57-
if (allDimVals != null) {
58-
for (int i = 0; i < allDimVals.size(); i++) {
59-
String dimVal = allDimVals.get(i);
60-
if (value.equals(fn.apply(dimVal))) {
61-
filters.add(new SelectorFilter(dimension, dimVal));
57+
if (allDimVals == null) {
58+
allDimVals = new Indexed<String>()
59+
{
60+
@Override
61+
public Iterator<String> iterator()
62+
{
63+
return null;
6264
}
65+
66+
@Override
67+
public Class<? extends String> getClazz()
68+
{
69+
return null;
70+
}
71+
72+
@Override
73+
public int size() { return 1; }
74+
75+
@Override
76+
public String get(int index) { return null;}
77+
78+
@Override
79+
public int indexOf(String value)
80+
{
81+
return 0;
82+
}
83+
};
84+
}
85+
86+
for (int i = 0; i < allDimVals.size(); i++) {
87+
String dimVal = allDimVals.get(i);
88+
if (value.equals(Strings.nullToEmpty(fn.apply(dimVal)))) {
89+
filters.add(new SelectorFilter(dimension, dimVal));
6390
}
6491
}
6592

@@ -79,13 +106,48 @@ public ImmutableBitmap getBitmapIndex(BitmapIndexSelector selector)
79106
@Override
80107
public ValueMatcher makeMatcher(ValueMatcherFactory factory)
81108
{
82-
throw new UnsupportedOperationException();
109+
return factory.makeValueMatcher(
110+
dimension, new Predicate<String>()
111+
{
112+
@Override
113+
public boolean apply(String input)
114+
{
115+
// Assuming that a null/absent/empty dimension are equivalent from the druid perspective
116+
return value.equals(Strings.nullToEmpty(fn.apply(Strings.emptyToNull(input))));
117+
}
118+
}
119+
);
83120
}
84121

85122
@Override
86-
public ValueMatcher makeMatcher(ColumnSelectorFactory factory)
123+
public ValueMatcher makeMatcher(ColumnSelectorFactory columnSelectorFactory)
87124
{
88-
throw new UnsupportedOperationException();
125+
final DimensionSelector dimensionSelector = columnSelectorFactory.makeDimensionSelector(dimension, null);
126+
if (dimensionSelector == null) {
127+
return new BooleanValueMatcher(value.equals(Strings.nullToEmpty(fn.apply(null))));
128+
} else {
129+
final BitSet bitSetOfIds = new BitSet(dimensionSelector.getValueCardinality());
130+
for (int i = 0; i < dimensionSelector.getValueCardinality(); i++) {
131+
if (value.equals(Strings.nullToEmpty(fn.apply(dimensionSelector.lookupName(i))))) {
132+
bitSetOfIds.set(i);
133+
}
134+
}
135+
return new ValueMatcher()
136+
{
137+
@Override
138+
public boolean matches()
139+
{
140+
final IndexedInts row = dimensionSelector.getRow();
141+
final int size = row.size();
142+
for (int i = 0; i < size; ++i) {
143+
if (bitSetOfIds.get(row.get(i))) {
144+
return true;
145+
}
146+
}
147+
return false;
148+
}
149+
};
150+
}
89151
}
90152

91153
}

‎processing/src/test/java/io/druid/query/groupby/GroupByQueryRunnerTest.java

+269
Large diffs are not rendered by default.

‎processing/src/test/java/io/druid/query/topn/TopNQueryRunnerTest.java

+108-5
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141
import io.druid.query.aggregation.AggregatorFactory;
4242
import io.druid.query.aggregation.DoubleMaxAggregatorFactory;
4343
import io.druid.query.aggregation.DoubleMinAggregatorFactory;
44+
import io.druid.query.aggregation.FilteredAggregatorFactory;
4445
import io.druid.query.aggregation.PostAggregator;
4546
import io.druid.query.aggregation.cardinality.CardinalityAggregatorFactory;
4647
import io.druid.query.aggregation.hyperloglog.HyperUniquesAggregatorFactory;
@@ -54,6 +55,7 @@
5455
import io.druid.query.extraction.TimeFormatExtractionFn;
5556
import io.druid.query.filter.AndDimFilter;
5657
import io.druid.query.filter.DimFilter;
58+
import io.druid.query.filter.ExtractionDimFilter;
5759
import io.druid.query.filter.SelectorDimFilter;
5860
import io.druid.query.spec.MultipleIntervalSegmentSpec;
5961
import io.druid.query.timeseries.TimeseriesQuery;
@@ -158,9 +160,7 @@ private Sequence<Result<TopNResultValue>> runWithMerge(
158160
QueryRunnerTestHelper.NoopIntervalChunkingQueryRunnerDecorator()
159161
);
160162
final QueryRunner<Result<TopNResultValue>> mergeRunner = chest.mergeResults(runner);
161-
return mergeRunner.run(
162-
query, context
163-
);
163+
return mergeRunner.run(query, context);
164164
}
165165

166166
@Test
@@ -1611,7 +1611,6 @@ public void testTopNDimExtraction()
16111611
}
16121612

16131613

1614-
16151614
@Test
16161615
public void testTopNDimExtractionFastTopNOptimalWithReplaceMissing()
16171616
{
@@ -3119,8 +3118,10 @@ public void testTopNOverPartialNullDimensionWithFilterOnNOTNullValue()
31193118
);
31203119
assertExpectedResults(expectedResults, query);
31213120
}
3121+
31223122
@Test
3123-
public void testAlphaNumericTopNWithNullPreviousStop(){
3123+
public void testAlphaNumericTopNWithNullPreviousStop()
3124+
{
31243125
TopNQuery query = new TopNQueryBuilder()
31253126
.dataSource(QueryRunnerTestHelper.dataSource)
31263127
.granularity(QueryGranularity.ALL)
@@ -3149,4 +3150,106 @@ public void testAlphaNumericTopNWithNullPreviousStop(){
31493150
);
31503151
TestHelper.assertExpectedResults(expectedResults, runner.run(query, new HashMap<String, Object>()));
31513152
}
3153+
3154+
@Test
3155+
public void testTopNWithExtractionFilter()
3156+
{
3157+
Map<String, String> extractionMap = new HashMap<>();
3158+
extractionMap.put("spot", "spot0");
3159+
MapLookupExtractor mapLookupExtractor = new MapLookupExtractor(extractionMap);
3160+
LookupExtractionFn lookupExtractionFn = new LookupExtractionFn(mapLookupExtractor, false, null, true);
3161+
3162+
TopNQuery query = new TopNQueryBuilder().dataSource(QueryRunnerTestHelper.dataSource)
3163+
.granularity(QueryRunnerTestHelper.allGran)
3164+
.dimension(QueryRunnerTestHelper.marketDimension)
3165+
.metric("rows")
3166+
.threshold(3)
3167+
.intervals(QueryRunnerTestHelper.firstToThird)
3168+
.aggregators(QueryRunnerTestHelper.commonAggregators)
3169+
.postAggregators(Arrays.<PostAggregator>asList(QueryRunnerTestHelper.addRowsIndexConstant))
3170+
.filters(
3171+
new ExtractionDimFilter(
3172+
QueryRunnerTestHelper.marketDimension,
3173+
"spot0",
3174+
lookupExtractionFn,
3175+
null
3176+
)
3177+
)
3178+
.build();
3179+
3180+
List<Result<TopNResultValue>> expectedResults = Arrays.asList(
3181+
new Result<>(
3182+
new DateTime("2011-04-01T00:00:00.000Z"),
3183+
new TopNResultValue(
3184+
Arrays.<Map<String, Object>>asList(
3185+
ImmutableMap.<String, Object>of(
3186+
QueryRunnerTestHelper.marketDimension, "spot",
3187+
"rows", 18L,
3188+
"index", 2231.8768157958984D,
3189+
"addRowsIndexConstant", 2250.8768157958984D,
3190+
"uniques", QueryRunnerTestHelper.UNIQUES_9
3191+
)
3192+
)
3193+
)
3194+
)
3195+
);
3196+
3197+
assertExpectedResults(expectedResults, query);
3198+
}
3199+
3200+
@Test
3201+
public void testTopNWithExtractionFilterAndFilteredAggregatorCaseNoExistingValue()
3202+
{
3203+
Map<String, String> extractionMap = new HashMap<>();
3204+
extractionMap.put("", "NULL");
3205+
3206+
MapLookupExtractor mapLookupExtractor = new MapLookupExtractor(extractionMap);
3207+
LookupExtractionFn lookupExtractionFn = new LookupExtractionFn(mapLookupExtractor, false, null, true);
3208+
DimFilter extractionFilter = new ExtractionDimFilter("null_column", "NULL", lookupExtractionFn, null);
3209+
TopNQueryBuilder topNQueryBuilder = new TopNQueryBuilder()
3210+
.dataSource(QueryRunnerTestHelper.dataSource)
3211+
.granularity(QueryRunnerTestHelper.allGran)
3212+
.dimension("null_column")
3213+
.metric(QueryRunnerTestHelper.indexMetric)
3214+
.threshold(4)
3215+
.intervals(QueryRunnerTestHelper.fullOnInterval)
3216+
.aggregators(
3217+
Lists.newArrayList(
3218+
Iterables.concat(
3219+
QueryRunnerTestHelper.commonAggregators, Lists.newArrayList(
3220+
new FilteredAggregatorFactory(
3221+
new DoubleMaxAggregatorFactory("maxIndex", "index"),
3222+
extractionFilter
3223+
),
3224+
new DoubleMinAggregatorFactory("minIndex", "index")
3225+
)
3226+
)
3227+
)
3228+
)
3229+
.postAggregators(Arrays.<PostAggregator>asList(QueryRunnerTestHelper.addRowsIndexConstant));
3230+
TopNQuery topNQueryWithNULLValueExtraction = topNQueryBuilder
3231+
.filters(extractionFilter)
3232+
.build();
3233+
3234+
Map<String, Object> map = Maps.newHashMap();
3235+
map.put("null_column", null);
3236+
map.put("rows", 1209L);
3237+
map.put("index", 503332.5071372986D);
3238+
map.put("addRowsIndexConstant", 504542.5071372986D);
3239+
map.put("uniques", QueryRunnerTestHelper.UNIQUES_9);
3240+
map.put("maxIndex", 1870.06103515625D);
3241+
map.put("minIndex", 59.02102279663086D);
3242+
List<Result<TopNResultValue>> expectedResults = Arrays.asList(
3243+
new Result<>(
3244+
new DateTime("2011-01-12T00:00:00.000Z"),
3245+
new TopNResultValue(
3246+
Arrays.asList(
3247+
map
3248+
)
3249+
)
3250+
)
3251+
);
3252+
assertExpectedResults(expectedResults, topNQueryWithNULLValueExtraction);
3253+
}
3254+
31523255
}

0 commit comments

Comments
 (0)
Please sign in to comment.