Skip to content

Commit a6d5f8a

Browse files
Merge commit '47026a2a3dd41a5c87e44ade58d91a89feba147b' into chunchun/update-df-june-week-2-2
2 parents fceab3c + 47026a2 commit a6d5f8a

File tree

145 files changed

+4866
-3242
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

145 files changed

+4866
-3242
lines changed

datafusion-cli/Cargo.lock

Lines changed: 10 additions & 10 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

datafusion-examples/examples/csv_opener.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ async fn main() -> Result<()> {
4848
b',',
4949
b'"',
5050
object_store,
51+
Some(b'#'),
5152
);
5253

5354
let opener = CsvOpener::new(Arc::new(config), FileCompressionType::UNCOMPRESSED);

datafusion-examples/examples/expr_api.rs

Lines changed: 38 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ use arrow::record_batch::RecordBatch;
2424
use datafusion::arrow::datatypes::{DataType, Field, Schema, TimeUnit};
2525
use datafusion::common::DFSchema;
2626
use datafusion::error::Result;
27+
use datafusion::functions_aggregate::first_last::first_value_udaf;
2728
use datafusion::optimizer::simplify_expressions::ExprSimplifier;
2829
use datafusion::physical_expr::{analyze, AnalysisContext, ExprBoundaries};
2930
use datafusion::prelude::*;
@@ -32,7 +33,7 @@ use datafusion_expr::execution_props::ExecutionProps;
3233
use datafusion_expr::expr::BinaryExpr;
3334
use datafusion_expr::interval_arithmetic::Interval;
3435
use datafusion_expr::simplify::SimplifyContext;
35-
use datafusion_expr::{ColumnarValue, ExprSchemable, Operator};
36+
use datafusion_expr::{AggregateExt, ColumnarValue, ExprSchemable, Operator};
3637

3738
/// This example demonstrates the DataFusion [`Expr`] API.
3839
///
@@ -44,11 +45,12 @@ use datafusion_expr::{ColumnarValue, ExprSchemable, Operator};
4445
/// also comes with APIs for evaluation, simplification, and analysis.
4546
///
4647
/// The code in this example shows how to:
47-
/// 1. Create [`Exprs`] using different APIs: [`main`]`
48-
/// 2. Evaluate [`Exprs`] against data: [`evaluate_demo`]
49-
/// 3. Simplify expressions: [`simplify_demo`]
50-
/// 4. Analyze predicates for boundary ranges: [`range_analysis_demo`]
51-
/// 5. Get the types of the expressions: [`expression_type_demo`]
48+
/// 1. Create [`Expr`]s using different APIs: [`main`]
49+
/// 2. Use the fluent API to easily create complex [`Expr`]s: [`expr_fn_demo`]
50+
/// 3. Evaluate [`Expr`]s against data: [`evaluate_demo`]
51+
/// 4. Simplify expressions: [`simplify_demo`]
52+
/// 5. Analyze predicates for boundary ranges: [`range_analysis_demo`]
53+
/// 6. Get the types of the expressions: [`expression_type_demo`]
5254
#[tokio::main]
5355
async fn main() -> Result<()> {
5456
// The easiest way to do create expressions is to use the
@@ -63,6 +65,9 @@ async fn main() -> Result<()> {
6365
));
6466
assert_eq!(expr, expr2);
6567

68+
// See how to build aggregate functions with the expr_fn API
69+
expr_fn_demo()?;
70+
6671
// See how to evaluate expressions
6772
evaluate_demo()?;
6873

@@ -78,6 +83,33 @@ async fn main() -> Result<()> {
7883
Ok(())
7984
}
8085

86+
/// Datafusion's `expr_fn` API makes it easy to create [`Expr`]s for the
87+
/// full range of expression types such as aggregates and window functions.
88+
fn expr_fn_demo() -> Result<()> {
89+
// Let's say you want to call the "first_value" aggregate function
90+
let first_value = first_value_udaf();
91+
92+
// For example, to create the expression `FIRST_VALUE(price)`
93+
// These expressions can be passed to `DataFrame::aggregate` and other
94+
// APIs that take aggregate expressions.
95+
let agg = first_value.call(vec![col("price")]);
96+
assert_eq!(agg.to_string(), "first_value(price)");
97+
98+
// You can use the AggregateExt trait to create more complex aggregates
99+
// such as `FIRST_VALUE(price FILTER quantity > 100 ORDER BY ts )
100+
let agg = first_value
101+
.call(vec![col("price")])
102+
.order_by(vec![col("ts").sort(false, false)])
103+
.filter(col("quantity").gt(lit(100)))
104+
.build()?; // build the aggregate
105+
assert_eq!(
106+
agg.to_string(),
107+
"first_value(price) FILTER (WHERE quantity > Int32(100)) ORDER BY [ts DESC NULLS LAST]"
108+
);
109+
110+
Ok(())
111+
}
112+
81113
/// DataFusion can also evaluate arbitrary expressions on Arrow arrays.
82114
fn evaluate_demo() -> Result<()> {
83115
// For example, let's say you have some integers in an array

datafusion-examples/examples/parquet_index.rs

Lines changed: 8 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ use arrow_schema::SchemaRef;
2525
use async_trait::async_trait;
2626
use datafusion::datasource::listing::PartitionedFile;
2727
use datafusion::datasource::physical_plan::{
28-
parquet::{RequestedStatistics, StatisticsConverter},
28+
parquet::StatisticsConverter,
2929
{FileScanConfig, ParquetExec},
3030
};
3131
use datafusion::datasource::TableProvider;
@@ -518,21 +518,17 @@ impl ParquetMetadataIndexBuilder {
518518

519519
// extract the parquet statistics from the file's footer
520520
let metadata = reader.metadata();
521+
let row_groups = metadata.row_groups();
521522

522523
// Extract the min/max values for each row group from the statistics
523-
let row_counts = StatisticsConverter::row_counts(reader.metadata())?;
524-
let value_column_mins = StatisticsConverter::try_new(
524+
let converter = StatisticsConverter::try_new(
525525
"value",
526-
RequestedStatistics::Min,
527526
reader.schema(),
528-
)?
529-
.extract(reader.metadata())?;
530-
let value_column_maxes = StatisticsConverter::try_new(
531-
"value",
532-
RequestedStatistics::Max,
533-
reader.schema(),
534-
)?
535-
.extract(reader.metadata())?;
527+
reader.parquet_schema(),
528+
)?;
529+
let row_counts = StatisticsConverter::row_group_row_counts(row_groups.iter())?;
530+
let value_column_mins = converter.row_group_mins(row_groups.iter())?;
531+
let value_column_maxes = converter.row_group_maxes(row_groups.iter())?;
536532

537533
// In a real system you would have to handle nulls, which represent
538534
// unknown statistics. All statistics are known in this example

datafusion/common/src/config.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1567,6 +1567,7 @@ config_namespace! {
15671567
pub timestamp_tz_format: Option<String>, default = None
15681568
pub time_format: Option<String>, default = None
15691569
pub null_value: Option<String>, default = None
1570+
pub comment: Option<u8>, default = None
15701571
}
15711572
}
15721573

datafusion/core/benches/parquet_statistic.rs

Lines changed: 9 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -24,9 +24,7 @@ use arrow_schema::{
2424
Field, Schema,
2525
};
2626
use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};
27-
use datafusion::datasource::physical_plan::parquet::{
28-
RequestedStatistics, StatisticsConverter,
29-
};
27+
use datafusion::datasource::physical_plan::parquet::StatisticsConverter;
3028
use parquet::arrow::{arrow_reader::ArrowReaderBuilder, ArrowWriter};
3129
use parquet::file::properties::WriterProperties;
3230
use std::sync::Arc;
@@ -159,41 +157,26 @@ fn criterion_benchmark(c: &mut Criterion) {
159157
let file = file.reopen().unwrap();
160158
let reader = ArrowReaderBuilder::try_new(file).unwrap();
161159
let metadata = reader.metadata();
160+
let row_groups = metadata.row_groups();
162161

163162
let mut group =
164163
c.benchmark_group(format!("Extract statistics for {}", dtype.clone()));
165164
group.bench_function(
166165
BenchmarkId::new("extract_statistics", dtype.clone()),
167166
|b| {
168167
b.iter(|| {
169-
let _ = StatisticsConverter::try_new(
170-
"col",
171-
RequestedStatistics::Min,
172-
reader.schema(),
173-
)
174-
.unwrap()
175-
.extract(metadata)
176-
.unwrap();
177-
178-
let _ = StatisticsConverter::try_new(
179-
"col",
180-
RequestedStatistics::Max,
181-
reader.schema(),
182-
)
183-
.unwrap()
184-
.extract(reader.metadata())
185-
.unwrap();
186-
187-
let _ = StatisticsConverter::try_new(
168+
let converter = StatisticsConverter::try_new(
188169
"col",
189-
RequestedStatistics::NullCount,
190170
reader.schema(),
171+
reader.parquet_schema(),
191172
)
192-
.unwrap()
193-
.extract(reader.metadata())
194173
.unwrap();
195174

196-
let _ = StatisticsConverter::row_counts(reader.metadata()).unwrap();
175+
let _ = converter.row_group_mins(row_groups.iter()).unwrap();
176+
let _ = converter.row_group_maxes(row_groups.iter()).unwrap();
177+
let _ = converter.row_group_null_counts(row_groups.iter()).unwrap();
178+
let _ = StatisticsConverter::row_group_row_counts(row_groups.iter())
179+
.unwrap();
197180
})
198181
},
199182
);

datafusion/core/src/dataframe/mod.rs

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -50,12 +50,12 @@ use datafusion_common::{
5050
};
5151
use datafusion_expr::lit;
5252
use datafusion_expr::{
53-
avg, count, max, min, stddev, utils::COUNT_STAR_EXPANSION,
54-
TableProviderFilterPushDown, UNNAMED_TABLE,
53+
avg, count, max, min, utils::COUNT_STAR_EXPANSION, TableProviderFilterPushDown,
54+
UNNAMED_TABLE,
5555
};
5656
use datafusion_expr::{case, is_null};
57-
use datafusion_functions_aggregate::expr_fn::median;
5857
use datafusion_functions_aggregate::expr_fn::sum;
58+
use datafusion_functions_aggregate::expr_fn::{median, stddev};
5959

6060
use async_trait::async_trait;
6161

@@ -1820,7 +1820,7 @@ mod tests {
18201820

18211821
assert_batches_sorted_eq!(
18221822
["+----+-----------------------------+-----------------------------+-----------------------------+-----------------------------+-------------------------------+----------------------------------------+",
1823-
"| c1 | MIN(aggregate_test_100.c12) | MAX(aggregate_test_100.c12) | AVG(aggregate_test_100.c12) | SUM(aggregate_test_100.c12) | COUNT(aggregate_test_100.c12) | COUNT(DISTINCT aggregate_test_100.c12) |",
1823+
"| c1 | MIN(aggregate_test_100.c12) | MAX(aggregate_test_100.c12) | AVG(aggregate_test_100.c12) | sum(aggregate_test_100.c12) | COUNT(aggregate_test_100.c12) | COUNT(DISTINCT aggregate_test_100.c12) |",
18241824
"+----+-----------------------------+-----------------------------+-----------------------------+-----------------------------+-------------------------------+----------------------------------------+",
18251825
"| a | 0.02182578039211991 | 0.9800193410444061 | 0.48754517466109415 | 10.238448667882977 | 21 | 21 |",
18261826
"| b | 0.04893135681998029 | 0.9185813970744787 | 0.41040709263815384 | 7.797734760124923 | 19 | 19 |",
@@ -2395,7 +2395,7 @@ mod tests {
23952395
assert_batches_sorted_eq!(
23962396
[
23972397
"+----+-----------------------------+",
2398-
"| c1 | SUM(aggregate_test_100.c12) |",
2398+
"| c1 | sum(aggregate_test_100.c12) |",
23992399
"+----+-----------------------------+",
24002400
"| a | 10.238448667882977 |",
24012401
"| b | 7.797734760124923 |",
@@ -2411,7 +2411,7 @@ mod tests {
24112411
assert_batches_sorted_eq!(
24122412
[
24132413
"+----+---------------------+",
2414-
"| c1 | SUM(test_table.c12) |",
2414+
"| c1 | sum(test_table.c12) |",
24152415
"+----+---------------------+",
24162416
"| a | 10.238448667882977 |",
24172417
"| b | 7.797734760124923 |",

datafusion/core/src/datasource/file_format/csv.rs

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,12 @@ impl CsvFormat {
147147
self.options.has_header
148148
}
149149

150+
/// Lines beginning with this byte are ignored.
151+
pub fn with_comment(mut self, comment: Option<u8>) -> Self {
152+
self.options.comment = comment;
153+
self
154+
}
155+
150156
/// The character separating values within a row.
151157
/// - default to ','
152158
pub fn with_delimiter(mut self, delimiter: u8) -> Self {
@@ -252,6 +258,7 @@ impl FileFormat for CsvFormat {
252258
self.options.delimiter,
253259
self.options.quote,
254260
self.options.escape,
261+
self.options.comment,
255262
self.options.compression.into(),
256263
);
257264
Ok(Arc::new(exec))
@@ -300,7 +307,7 @@ impl CsvFormat {
300307
pin_mut!(stream);
301308

302309
while let Some(chunk) = stream.next().await.transpose()? {
303-
let format = arrow::csv::reader::Format::default()
310+
let mut format = arrow::csv::reader::Format::default()
304311
.with_header(
305312
first_chunk
306313
&& self
@@ -310,6 +317,10 @@ impl CsvFormat {
310317
)
311318
.with_delimiter(self.options.delimiter);
312319

320+
if let Some(comment) = self.options.comment {
321+
format = format.with_comment(comment);
322+
}
323+
313324
let (Schema { fields, .. }, records_read) =
314325
format.infer_schema(chunk.reader(), Some(records_to_read))?;
315326

@@ -919,7 +930,7 @@ mod tests {
919930

920931
#[rustfmt::skip]
921932
let expected = ["+--------------+",
922-
"| SUM(aggr.c2) |",
933+
"| sum(aggr.c2) |",
923934
"+--------------+",
924935
"| 285 |",
925936
"+--------------+"];
@@ -956,7 +967,7 @@ mod tests {
956967

957968
#[rustfmt::skip]
958969
let expected = ["+--------------+",
959-
"| SUM(aggr.c3) |",
970+
"| sum(aggr.c3) |",
960971
"+--------------+",
961972
"| 781 |",
962973
"+--------------+"];
@@ -1122,7 +1133,7 @@ mod tests {
11221133

11231134
#[rustfmt::skip]
11241135
let expected = ["+---------------------+",
1125-
"| SUM(empty.column_1) |",
1136+
"| sum(empty.column_1) |",
11261137
"+---------------------+",
11271138
"| 10 |",
11281139
"+---------------------+"];
@@ -1161,7 +1172,7 @@ mod tests {
11611172

11621173
#[rustfmt::skip]
11631174
let expected = ["+-----------------------+",
1164-
"| SUM(one_col.column_1) |",
1175+
"| sum(one_col.column_1) |",
11651176
"+-----------------------+",
11661177
"| 50 |",
11671178
"+-----------------------+"];

datafusion/core/src/datasource/file_format/json.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -470,15 +470,15 @@ mod tests {
470470
ctx.register_json("json_parallel", table_path, options)
471471
.await?;
472472

473-
let query = "SELECT SUM(a) FROM json_parallel;";
473+
let query = "SELECT sum(a) FROM json_parallel;";
474474

475475
let result = ctx.sql(query).await?.collect().await?;
476476
let actual_partitions = count_num_partitions(&ctx, query).await?;
477477

478478
#[rustfmt::skip]
479479
let expected = [
480480
"+----------------------+",
481-
"| SUM(json_parallel.a) |",
481+
"| sum(json_parallel.a) |",
482482
"+----------------------+",
483483
"| -7 |",
484484
"+----------------------+"

0 commit comments

Comments
 (0)