Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implemented simplify for the starts_with function to convert it into a LIKE expression. #14119

Merged
merged 6 commits into from
Jan 23, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 45 additions & 2 deletions datafusion/functions/src/string/starts_with.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,11 @@ use std::sync::Arc;

use arrow::array::ArrayRef;
use arrow::datatypes::DataType;
use datafusion_expr::simplify::{ExprSimplifyResult, SimplifyInfo};

use crate::utils::make_scalar_function;
use datafusion_common::{internal_err, Result};
use datafusion_expr::{ColumnarValue, Documentation};
use datafusion_common::{internal_err, Result, ScalarValue};
use datafusion_expr::{ColumnarValue, Documentation, Expr, Like};
use datafusion_expr::{ScalarUDFImpl, Signature, Volatility};
use datafusion_macros::user_doc;

Expand Down Expand Up @@ -98,6 +99,48 @@ impl ScalarUDFImpl for StartsWithFunc {
}
}

fn simplify(
&self,
args: Vec<Expr>,
_info: &dyn SimplifyInfo,
) -> Result<ExprSimplifyResult> {
if let Expr::Literal(scalar_value) = &args[1] {
// Convert starts_with(col, 'prefix') to col LIKE 'prefix%' with proper escaping
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I double checked the escaping logic and I think this looks good to me.

// Example: starts_with(col, 'ja%') -> col LIKE 'ja\%%'
// 1. 'ja%' (input pattern)
// 2. 'ja\%' (escape special char '%')
// 3. 'ja\%%' (add suffix for starts_with)
let like_expr = match scalar_value {
ScalarValue::Utf8(Some(pattern)) => {
let escaped_pattern = pattern.replace("%", "\\%");
let like_pattern = format!("{}%", escaped_pattern);
Expr::Literal(ScalarValue::Utf8(Some(like_pattern)))
}
ScalarValue::LargeUtf8(Some(pattern)) => {
let escaped_pattern = pattern.replace("%", "\\%");
let like_pattern = format!("{}%", escaped_pattern);
Expr::Literal(ScalarValue::LargeUtf8(Some(like_pattern)))
}
ScalarValue::Utf8View(Some(pattern)) => {
let escaped_pattern = pattern.replace("%", "\\%");
let like_pattern = format!("{}%", escaped_pattern);
Expr::Literal(ScalarValue::Utf8View(Some(like_pattern)))
}
_ => return Ok(ExprSimplifyResult::Original(args)),
};

return Ok(ExprSimplifyResult::Simplified(Expr::Like(Like {
negated: false,
expr: Box::new(args[0].clone()),
pattern: Box::new(like_expr),
escape_char: None,
case_insensitive: false,
})));
}

Ok(ExprSimplifyResult::Original(args))
}

fn documentation(&self) -> Option<&Documentation> {
self.doc()
}
Expand Down
31 changes: 31 additions & 0 deletions datafusion/sqllogictest/test_files/parquet.slt
Original file line number Diff line number Diff line change
Expand Up @@ -598,3 +598,34 @@ drop table cpu;

statement ok
drop table cpu_parquet;

# Test for parquet predicate pruning with `starts_with` function
query I
copy (values ('foo'), ('bar'), ('baz')) TO 'test_files/scratch/parquet/foo.parquet'
----
3

statement ok
create external table foo
stored as parquet
location 'test_files/scratch/parquet/foo.parquet';


# Expect that the pruning predicate contain a comparison on the min/max value of `column1):
# column1_min@0 <= g AND f <= column1_max@1`
# (the starts_with function is not supported in the parquet predicate pruning but DataFusion rewrites
# it to a like which is then handled by the PruningPredicate)
query TT
explain select * from foo where starts_with(column1, 'f');
----
logical_plan
01)Filter: foo.column1 LIKE Utf8View("f%")
02)--TableScan: foo projection=[column1], partial_filters=[foo.column1 LIKE Utf8View("f%")]
physical_plan
01)CoalesceBatchesExec: target_batch_size=8192
02)--FilterExec: column1@0 LIKE f%
03)----RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
04)------ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/foo.parquet]]}, projection=[column1], predicate=column1@0 LIKE f%, pruning_predicate=column1_null_count@2 != column1_row_count@3 AND column1_min@0 <= g AND f <= column1_max@1, required_guarantees=[]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is so cool!


statement ok
drop table foo
46 changes: 44 additions & 2 deletions datafusion/sqllogictest/test_files/string/string_view.slt
Original file line number Diff line number Diff line change
Expand Up @@ -344,9 +344,51 @@ EXPLAIN SELECT
FROM test;
----
logical_plan
01)Projection: starts_with(test.column1_utf8view, Utf8View("äöüß")) AS c1, starts_with(test.column1_utf8view, Utf8View("")) AS c2, starts_with(test.column1_utf8view, Utf8View(NULL)) AS c3, starts_with(Utf8View(NULL), test.column1_utf8view) AS c4
01)Projection: test.column1_utf8view LIKE Utf8View("äöüß%") AS c1, CASE test.column1_utf8view IS NOT NULL WHEN Boolean(true) THEN Boolean(true) END AS c2, starts_with(test.column1_utf8view, Utf8View(NULL)) AS c3, starts_with(Utf8View(NULL), test.column1_utf8view) AS c4
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is actually pretty cool -- it figured out that STARTS_WITH(column1_utf8view, '') as c2, is true if column1_utf8view is NOT NULL

02)--TableScan: test projection=[column1_utf8view]

## Test STARTS_WITH is rewitten to LIKE when the pattern is a constant
query TT
EXPLAIN SELECT
STARTS_WITH(column1_utf8, 'foo%') as c1,
STARTS_WITH(column1_large_utf8, 'foo%') as c2,
STARTS_WITH(column1_utf8view, 'foo%') as c3,
STARTS_WITH(column1_utf8, 'f_o') as c4,
STARTS_WITH(column1_large_utf8, 'f_o') as c5,
STARTS_WITH(column1_utf8view, 'f_o') as c6
FROM test;
----
logical_plan
01)Projection: test.column1_utf8 LIKE Utf8("foo\%%") AS c1, test.column1_large_utf8 LIKE LargeUtf8("foo\%%") AS c2, test.column1_utf8view LIKE Utf8View("foo\%%") AS c3, test.column1_utf8 LIKE Utf8("f_o%") AS c4, test.column1_large_utf8 LIKE LargeUtf8("f_o%") AS c5, test.column1_utf8view LIKE Utf8View("f_o%") AS c6
02)--TableScan: test projection=[column1_utf8, column1_large_utf8, column1_utf8view]

## Test STARTS_WITH works with column arguments
query TT
EXPLAIN SELECT
STARTS_WITH(column1_utf8, substr(column1_utf8, 1, 2)) as c1,
STARTS_WITH(column1_large_utf8, substr(column1_large_utf8, 1, 2)) as c2,
STARTS_WITH(column1_utf8view, substr(column1_utf8view, 1, 2)) as c3
FROM test;
----
logical_plan
01)Projection: starts_with(test.column1_utf8, substr(test.column1_utf8, Int64(1), Int64(2))) AS c1, starts_with(test.column1_large_utf8, substr(test.column1_large_utf8, Int64(1), Int64(2))) AS c2, starts_with(test.column1_utf8view, substr(test.column1_utf8view, Int64(1), Int64(2))) AS c3
02)--TableScan: test projection=[column1_utf8, column1_large_utf8, column1_utf8view]

query BBB
SELECT
STARTS_WITH(column1_utf8, substr(column1_utf8, 1, 2)) as c1,
STARTS_WITH(column1_large_utf8, substr(column1_large_utf8, 1, 2)) as c2,
STARTS_WITH(column1_utf8view, substr(column1_utf8view, 1, 2)) as c3
FROM test;
----
true true true
true true true
true true true
true true true
NULL NULL NULL


# Ensure that INIT cap works with utf8view
query TT
EXPLAIN SELECT
INITCAP(column1_utf8view) as c
Expand Down Expand Up @@ -887,7 +929,7 @@ EXPLAIN SELECT
FROM test;
----
logical_plan
01)Projection: starts_with(test.column1_utf8view, Utf8View("foo")) AS c, starts_with(test.column1_utf8view, test.column2_utf8view) AS c2
01)Projection: test.column1_utf8view LIKE Utf8View("foo%") AS c, starts_with(test.column1_utf8view, test.column2_utf8view) AS c2
02)--TableScan: test projection=[column1_utf8view, column2_utf8view]

## Ensure no casts for TRANSLATE
Expand Down
Loading