Skip to content

Commit 922b399

Browse files
committed
Revert "Enable reading StringViewArray by default from Parquet (8% improvement for entire ClickBench suite) (apache#13101)"
This reverts commit 2d7892b.
1 parent 50e1209 commit 922b399

File tree

14 files changed

+65
-47
lines changed

14 files changed

+65
-47
lines changed

benchmarks/src/bin/external_aggr.rs

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -193,7 +193,12 @@ impl ExternalAggrConfig {
193193
) -> Result<Vec<QueryResult>> {
194194
let query_name =
195195
format!("Q{query_id}({})", human_readable_size(mem_limit as usize));
196-
let config = self.common.config();
196+
let mut config = self.common.config();
197+
config
198+
.options_mut()
199+
.execution
200+
.parquet
201+
.schema_force_view_types = self.common.force_view_types;
197202
let runtime_config = RuntimeConfig::new()
198203
.with_memory_pool(Arc::new(FairSpillPool::new(mem_limit as usize)))
199204
.build_arc()?;

benchmarks/src/clickbench.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,7 @@ impl RunOpt {
119119
let mut config = self.common.config();
120120
{
121121
let parquet_options = &mut config.options_mut().execution.parquet;
122+
parquet_options.schema_force_view_types = self.common.force_view_types;
122123
// The hits_partitioned dataset specifies string columns
123124
// as binary due to how it was written. Force it to strings
124125
parquet_options.binary_as_string = true;

benchmarks/src/imdb/run.rs

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -305,7 +305,11 @@ impl RunOpt {
305305
.config()
306306
.with_collect_statistics(!self.disable_statistics);
307307
config.options_mut().optimizer.prefer_hash_join = self.prefer_hash_join;
308-
308+
config
309+
.options_mut()
310+
.execution
311+
.parquet
312+
.schema_force_view_types = self.common.force_view_types;
309313
let ctx = SessionContext::new_with_config(config);
310314

311315
// register tables
@@ -513,6 +517,7 @@ mod tests {
513517
partitions: Some(2),
514518
batch_size: 8192,
515519
debug: false,
520+
force_view_types: false,
516521
};
517522
let opt = RunOpt {
518523
query: Some(query),
@@ -546,6 +551,7 @@ mod tests {
546551
partitions: Some(2),
547552
batch_size: 8192,
548553
debug: false,
554+
force_view_types: false,
549555
};
550556
let opt = RunOpt {
551557
query: Some(query),

benchmarks/src/tpch/run.rs

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,11 @@ impl RunOpt {
120120
.config()
121121
.with_collect_statistics(!self.disable_statistics);
122122
config.options_mut().optimizer.prefer_hash_join = self.prefer_hash_join;
123+
config
124+
.options_mut()
125+
.execution
126+
.parquet
127+
.schema_force_view_types = self.common.force_view_types;
123128
let ctx = SessionContext::new_with_config(config);
124129

125130
// register tables
@@ -340,6 +345,7 @@ mod tests {
340345
partitions: Some(2),
341346
batch_size: 8192,
342347
debug: false,
348+
force_view_types: false,
343349
};
344350
let opt = RunOpt {
345351
query: Some(query),
@@ -373,6 +379,7 @@ mod tests {
373379
partitions: Some(2),
374380
batch_size: 8192,
375381
debug: false,
382+
force_view_types: false,
376383
};
377384
let opt = RunOpt {
378385
query: Some(query),

benchmarks/src/util/options.rs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,11 @@ pub struct CommonOpt {
3737
/// Activate debug mode to see more details
3838
#[structopt(short, long)]
3939
pub debug: bool,
40+
41+
/// If true, will use StringView/BinaryViewArray instead of String/BinaryArray
42+
/// when reading ParquetFiles
43+
#[structopt(long)]
44+
pub force_view_types: bool,
4045
}
4146

4247
impl CommonOpt {

datafusion/common/src/config.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -399,7 +399,7 @@ config_namespace! {
399399

400400
/// (reading) If true, parquet reader will read columns of `Utf8/Utf8Large` with `Utf8View`,
401401
/// and `Binary/BinaryLarge` with `BinaryView`.
402-
pub schema_force_view_types: bool, default = true
402+
pub schema_force_view_types: bool, default = false
403403

404404
/// (reading) If true, parquet reader will read columns of
405405
/// `Binary/LargeBinary` with `Utf8`, and `BinaryView` with `Utf8View`.

datafusion/common/src/scalar/mod.rs

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -980,11 +980,6 @@ impl ScalarValue {
980980
ScalarValue::from(val.into())
981981
}
982982

983-
/// Returns a [`ScalarValue::Utf8View`] representing `val`
984-
pub fn new_utf8view(val: impl Into<String>) -> Self {
985-
ScalarValue::Utf8View(Some(val.into()))
986-
}
987-
988983
/// Returns a [`ScalarValue::IntervalYearMonth`] representing
989984
/// `years` years and `months` months
990985
pub fn new_interval_ym(years: i32, months: i32) -> Self {

datafusion/core/tests/parquet/page_pruning.rs

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -149,9 +149,8 @@ async fn page_index_filter_one_col() {
149149
let session_ctx = SessionContext::new();
150150
let task_ctx = session_ctx.task_ctx();
151151

152-
// 5.create filter date_string_col == "01/01/09"`;
153-
// Note this test doesn't apply type coercion so the literal must match the actual view type
154-
let filter = col("date_string_col").eq(lit(ScalarValue::new_utf8view("01/01/09")));
152+
// 5.create filter date_string_col == 1;
153+
let filter = col("date_string_col").eq(lit("01/01/09"));
155154
let parquet_exec = get_parquet_exec(&state, filter).await;
156155
let mut results = parquet_exec.execute(0, task_ctx.clone()).unwrap();
157156
let batch = results.next().await.unwrap().unwrap();

datafusion/sqllogictest/test_files/describe.slt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -81,8 +81,8 @@ int_col Int32 YES
8181
bigint_col Int64 YES
8282
float_col Float32 YES
8383
double_col Float64 YES
84-
date_string_col Utf8View YES
85-
string_col Utf8View YES
84+
date_string_col Utf8 YES
85+
string_col Utf8 YES
8686
timestamp_col Timestamp(Nanosecond, None) YES
8787
year Int32 YES
8888
month Int32 YES

datafusion/sqllogictest/test_files/explain.slt

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -305,8 +305,8 @@ initial_physical_plan
305305
01)GlobalLimitExec: skip=0, fetch=10, statistics=[Rows=Exact(8), Bytes=Absent, [(Col[0]:),(Col[1]:),(Col[2]:),(Col[3]:),(Col[4]:),(Col[5]:),(Col[6]:),(Col[7]:),(Col[8]:),(Col[9]:),(Col[10]:)]]
306306
02)--ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, statistics=[Rows=Exact(8), Bytes=Absent, [(Col[0]:),(Col[1]:),(Col[2]:),(Col[3]:),(Col[4]:),(Col[5]:),(Col[6]:),(Col[7]:),(Col[8]:),(Col[9]:),(Col[10]:)]]
307307
initial_physical_plan_with_schema
308-
01)GlobalLimitExec: skip=0, fetch=10, schema=[id:Int32;N, bool_col:Boolean;N, tinyint_col:Int32;N, smallint_col:Int32;N, int_col:Int32;N, bigint_col:Int64;N, float_col:Float32;N, double_col:Float64;N, date_string_col:BinaryView;N, string_col:BinaryView;N, timestamp_col:Timestamp(Nanosecond, None);N]
309-
02)--ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, schema=[id:Int32;N, bool_col:Boolean;N, tinyint_col:Int32;N, smallint_col:Int32;N, int_col:Int32;N, bigint_col:Int64;N, float_col:Float32;N, double_col:Float64;N, date_string_col:BinaryView;N, string_col:BinaryView;N, timestamp_col:Timestamp(Nanosecond, None);N]
308+
01)GlobalLimitExec: skip=0, fetch=10, schema=[id:Int32;N, bool_col:Boolean;N, tinyint_col:Int32;N, smallint_col:Int32;N, int_col:Int32;N, bigint_col:Int64;N, float_col:Float32;N, double_col:Float64;N, date_string_col:Binary;N, string_col:Binary;N, timestamp_col:Timestamp(Nanosecond, None);N]
309+
02)--ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, schema=[id:Int32;N, bool_col:Boolean;N, tinyint_col:Int32;N, smallint_col:Int32;N, int_col:Int32;N, bigint_col:Int64;N, float_col:Float32;N, double_col:Float64;N, date_string_col:Binary;N, string_col:Binary;N, timestamp_col:Timestamp(Nanosecond, None);N]
310310
physical_plan after OutputRequirements
311311
01)OutputRequirementExec, statistics=[Rows=Exact(8), Bytes=Absent, [(Col[0]:),(Col[1]:),(Col[2]:),(Col[3]:),(Col[4]:),(Col[5]:),(Col[6]:),(Col[7]:),(Col[8]:),(Col[9]:),(Col[10]:)]]
312312
02)--GlobalLimitExec: skip=0, fetch=10, statistics=[Rows=Exact(8), Bytes=Absent, [(Col[0]:),(Col[1]:),(Col[2]:),(Col[3]:),(Col[4]:),(Col[5]:),(Col[6]:),(Col[7]:),(Col[8]:),(Col[9]:),(Col[10]:)]]
@@ -328,7 +328,7 @@ physical_plan after ProjectionPushdown SAME TEXT AS ABOVE
328328
physical_plan after LimitPushdown ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, statistics=[Rows=Exact(8), Bytes=Absent, [(Col[0]:),(Col[1]:),(Col[2]:),(Col[3]:),(Col[4]:),(Col[5]:),(Col[6]:),(Col[7]:),(Col[8]:),(Col[9]:),(Col[10]:)]]
329329
physical_plan after SanityCheckPlan SAME TEXT AS ABOVE
330330
physical_plan ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, statistics=[Rows=Exact(8), Bytes=Absent, [(Col[0]:),(Col[1]:),(Col[2]:),(Col[3]:),(Col[4]:),(Col[5]:),(Col[6]:),(Col[7]:),(Col[8]:),(Col[9]:),(Col[10]:)]]
331-
physical_plan_with_schema ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, schema=[id:Int32;N, bool_col:Boolean;N, tinyint_col:Int32;N, smallint_col:Int32;N, int_col:Int32;N, bigint_col:Int64;N, float_col:Float32;N, double_col:Float64;N, date_string_col:BinaryView;N, string_col:BinaryView;N, timestamp_col:Timestamp(Nanosecond, None);N]
331+
physical_plan_with_schema ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, schema=[id:Int32;N, bool_col:Boolean;N, tinyint_col:Int32;N, smallint_col:Int32;N, int_col:Int32;N, bigint_col:Int64;N, float_col:Float32;N, double_col:Float64;N, date_string_col:Binary;N, string_col:Binary;N, timestamp_col:Timestamp(Nanosecond, None);N]
332332

333333

334334
statement ok
@@ -345,8 +345,8 @@ initial_physical_plan_with_stats
345345
01)GlobalLimitExec: skip=0, fetch=10, statistics=[Rows=Exact(8), Bytes=Absent, [(Col[0]:),(Col[1]:),(Col[2]:),(Col[3]:),(Col[4]:),(Col[5]:),(Col[6]:),(Col[7]:),(Col[8]:),(Col[9]:),(Col[10]:)]]
346346
02)--ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, statistics=[Rows=Exact(8), Bytes=Absent, [(Col[0]:),(Col[1]:),(Col[2]:),(Col[3]:),(Col[4]:),(Col[5]:),(Col[6]:),(Col[7]:),(Col[8]:),(Col[9]:),(Col[10]:)]]
347347
initial_physical_plan_with_schema
348-
01)GlobalLimitExec: skip=0, fetch=10, schema=[id:Int32;N, bool_col:Boolean;N, tinyint_col:Int32;N, smallint_col:Int32;N, int_col:Int32;N, bigint_col:Int64;N, float_col:Float32;N, double_col:Float64;N, date_string_col:BinaryView;N, string_col:BinaryView;N, timestamp_col:Timestamp(Nanosecond, None);N]
349-
02)--ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, schema=[id:Int32;N, bool_col:Boolean;N, tinyint_col:Int32;N, smallint_col:Int32;N, int_col:Int32;N, bigint_col:Int64;N, float_col:Float32;N, double_col:Float64;N, date_string_col:BinaryView;N, string_col:BinaryView;N, timestamp_col:Timestamp(Nanosecond, None);N]
348+
01)GlobalLimitExec: skip=0, fetch=10, schema=[id:Int32;N, bool_col:Boolean;N, tinyint_col:Int32;N, smallint_col:Int32;N, int_col:Int32;N, bigint_col:Int64;N, float_col:Float32;N, double_col:Float64;N, date_string_col:Binary;N, string_col:Binary;N, timestamp_col:Timestamp(Nanosecond, None);N]
349+
02)--ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, schema=[id:Int32;N, bool_col:Boolean;N, tinyint_col:Int32;N, smallint_col:Int32;N, int_col:Int32;N, bigint_col:Int64;N, float_col:Float32;N, double_col:Float64;N, date_string_col:Binary;N, string_col:Binary;N, timestamp_col:Timestamp(Nanosecond, None);N]
350350
physical_plan after OutputRequirements
351351
01)OutputRequirementExec
352352
02)--GlobalLimitExec: skip=0, fetch=10
@@ -369,7 +369,7 @@ physical_plan after LimitPushdown ParquetExec: file_groups={1 group: [[WORKSPACE
369369
physical_plan after SanityCheckPlan SAME TEXT AS ABOVE
370370
physical_plan ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10
371371
physical_plan_with_stats ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, statistics=[Rows=Exact(8), Bytes=Absent, [(Col[0]:),(Col[1]:),(Col[2]:),(Col[3]:),(Col[4]:),(Col[5]:),(Col[6]:),(Col[7]:),(Col[8]:),(Col[9]:),(Col[10]:)]]
372-
physical_plan_with_schema ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, schema=[id:Int32;N, bool_col:Boolean;N, tinyint_col:Int32;N, smallint_col:Int32;N, int_col:Int32;N, bigint_col:Int64;N, float_col:Float32;N, double_col:Float64;N, date_string_col:BinaryView;N, string_col:BinaryView;N, timestamp_col:Timestamp(Nanosecond, None);N]
372+
physical_plan_with_schema ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, schema=[id:Int32;N, bool_col:Boolean;N, tinyint_col:Int32;N, smallint_col:Int32;N, int_col:Int32;N, bigint_col:Int64;N, float_col:Float32;N, double_col:Float64;N, date_string_col:Binary;N, string_col:Binary;N, timestamp_col:Timestamp(Nanosecond, None);N]
373373

374374

375375
statement ok

0 commit comments

Comments
 (0)