Skip to content

Commit 56dfd25

Browse files
authored
chore: update datafusion and related crates (#1504)
# Description Updating datafusion and related crates to latest version. With the updated object store, we unfortunately lose support for `aws-profile`. Since object store now also contains logic for parsing URLs that we currently maintain here, I was planning on adopting these new APIs and recovering profile support in a follow-up PR. This will then also remove the ignored deprecations from this PR.
1 parent 6650bd2 commit 56dfd25

File tree

19 files changed

+123
-100
lines changed

19 files changed

+123
-100
lines changed

docker-compose.yml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,9 @@ services:
1818
test: [ "CMD", "curl", "-f", "http://localhost:4566/health" ]
1919

2020
fake-gcs:
21-
image: fsouza/fake-gcs-server
22-
command: ["-scheme", "http", "-port", "4443", "-external-url", "http://[::]:4443", "-backend", "memory"]
21+
# Custom image - see fsouza/fake-gcs-server#1164
22+
image: tustvold/fake-gcs-server
23+
command: ["-scheme", "http", "-public-host", "localhost:4443", "-backend", "memory"]
2324
ports:
2425
- 4443:4443
2526

python/Cargo.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ doc = false
1818
name = "deltalake._internal"
1919

2020
[dependencies]
21-
arrow-schema = { version = "40", features = ["serde"] }
21+
arrow-schema = { version = "42", features = ["serde"] }
2222
chrono = "0"
2323
env_logger = "0"
2424
futures = "0.3"
@@ -35,7 +35,7 @@ num_cpus = "1"
3535
reqwest = { version = "*", features = ["native-tls-vendored"] }
3636

3737
[dependencies.pyo3]
38-
version = "0.18"
38+
version = "0.19"
3939
features = ["extension-module", "abi3", "abi3-py37"]
4040

4141
[dependencies.deltalake]

python/src/lib.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -775,7 +775,7 @@ fn write_new_deltalake(
775775
Ok(())
776776
}
777777

778-
#[pyclass(name = "DeltaDataChecker", text_signature = "(invariants)")]
778+
#[pyclass(name = "DeltaDataChecker")]
779779
struct PyDeltaDataChecker {
780780
inner: DeltaDataChecker,
781781
rt: tokio::runtime::Runtime,
@@ -784,6 +784,7 @@ struct PyDeltaDataChecker {
784784
#[pymethods]
785785
impl PyDeltaDataChecker {
786786
#[new]
787+
#[pyo3(signature = (invariants))]
787788
fn new(invariants: Vec<(String, String)>) -> Self {
788789
let invariants: Vec<Invariant> = invariants
789790
.into_iter()

python/src/schema.rs

Lines changed: 8 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,7 @@ fn python_type_to_schema(ob: PyObject, py: Python) -> PyResult<SchemaDataType> {
113113
/// * "decimal(<precision>, <scale>)"
114114
///
115115
/// :param data_type: string representation of the data type
116-
#[pyclass(module = "deltalake.schema", text_signature = "(data_type)")]
116+
#[pyclass(module = "deltalake.schema")]
117117
#[derive(Clone)]
118118
pub struct PrimitiveType {
119119
inner_type: String,
@@ -132,6 +132,7 @@ impl TryFrom<SchemaDataType> for PrimitiveType {
132132
#[pymethods]
133133
impl PrimitiveType {
134134
#[new]
135+
#[pyo3(signature = (data_type))]
135136
fn new(data_type: String) -> PyResult<Self> {
136137
if data_type.starts_with("decimal") {
137138
if try_parse_decimal_type(&data_type).is_none() {
@@ -246,10 +247,7 @@ impl PrimitiveType {
246247
/// ArrayType(PrimitiveType("integer"), contains_null=True)
247248
/// >>> ArrayType("integer", contains_null=False)
248249
/// ArrayType(PrimitiveType("integer"), contains_null=False)
249-
#[pyclass(
250-
module = "deltalake.schema",
251-
text_signature = "(element_type, contains_null=True)"
252-
)]
250+
#[pyclass(module = "deltalake.schema")]
253251
#[derive(Clone)]
254252
pub struct ArrayType {
255253
inner_type: SchemaTypeArray,
@@ -411,10 +409,7 @@ impl ArrayType {
411409
/// MapType(PrimitiveType("integer"), PrimitiveType("string"), value_contains_null=True)
412410
/// >>> MapType("integer", "string", value_contains_null=False)
413411
/// MapType(PrimitiveType("integer"), PrimitiveType("string"), value_contains_null=False)
414-
#[pyclass(
415-
module = "deltalake.schema",
416-
text_signature = "(key_type, value_type, value_contains_null=True)"
417-
)]
412+
#[pyclass(module = "deltalake.schema")]
418413
#[derive(Clone)]
419414
pub struct MapType {
420415
inner_type: SchemaTypeMap,
@@ -597,10 +592,7 @@ impl MapType {
597592
///
598593
/// >>> Field("my_col", "integer", metadata={"custom_metadata": {"test": 2}})
599594
/// Field("my_col", PrimitiveType("integer"), nullable=True, metadata={"custom_metadata": {"test": 2}})
600-
#[pyclass(
601-
module = "deltalake.schema",
602-
text_signature = "(name, type, nullable=True, metadata=None)"
603-
)]
595+
#[pyclass(module = "deltalake.schema")]
604596
#[derive(Clone)]
605597
pub struct Field {
606598
inner: SchemaField,
@@ -778,7 +770,7 @@ impl Field {
778770
///
779771
/// >>> StructType([Field("x", "integer"), Field("y", "string")])
780772
/// StructType([Field(x, PrimitiveType("integer"), nullable=True), Field(y, PrimitiveType("string"), nullable=True)])
781-
#[pyclass(subclass, module = "deltalake.schema", text_signature = "(fields)")]
773+
#[pyclass(subclass, module = "deltalake.schema")]
782774
#[derive(Clone)]
783775
pub struct StructType {
784776
inner_type: SchemaTypeStruct,
@@ -951,13 +943,13 @@ pub fn schema_to_pyobject(schema: &Schema, py: Python) -> PyResult<PyObject> {
951943
/// >>> import pyarrow as pa
952944
/// >>> Schema.from_pyarrow(pa.schema({"x": pa.int32(), "y": pa.string()}))
953945
/// Schema([Field(x, PrimitiveType("integer"), nullable=True), Field(y, PrimitiveType("string"), nullable=True)])
954-
#[pyclass(extends = StructType, name = "Schema", module = "deltalake.schema",
955-
text_signature = "(fields)")]
946+
#[pyclass(extends = StructType, name = "Schema", module = "deltalake.schema")]
956947
pub struct PySchema;
957948

958949
#[pymethods]
959950
impl PySchema {
960951
#[new]
952+
#[pyo3(signature = (fields))]
961953
fn new(fields: Vec<PyRef<Field>>) -> PyResult<(Self, StructType)> {
962954
let fields: Vec<SchemaField> = fields
963955
.into_iter()

rust/Cargo.toml

Lines changed: 19 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -13,14 +13,14 @@ readme = "README.md"
1313
edition = "2021"
1414

1515
[dependencies]
16-
arrow = { version = "40", optional = true }
17-
arrow-array = { version = "40", optional = true }
18-
arrow-buffer = { version = "40", optional = true }
19-
arrow-cast = { version = "40", optional = true }
20-
arrow-ord = { version = "40", optional = true }
21-
arrow-row = { version = "40", optional = true }
22-
arrow-schema = { version = "40", optional = true }
23-
arrow-select = { version = "40", optional = true }
16+
arrow = { version = "42", optional = true }
17+
arrow-array = { version = "42", optional = true }
18+
arrow-buffer = { version = "42", optional = true }
19+
arrow-cast = { version = "42", optional = true }
20+
arrow-ord = { version = "42", optional = true }
21+
arrow-row = { version = "42", optional = true }
22+
arrow-schema = { version = "42", optional = true }
23+
arrow-select = { version = "42", optional = true }
2424
async-trait = "0.1"
2525
bytes = "1"
2626
chrono = { version = "0.4.22", default-features = false, features = ["clock"] }
@@ -38,10 +38,10 @@ libc = ">=0.2.90, <1"
3838
num-bigint = "0.4"
3939
num_cpus = "1"
4040
num-traits = "0.2.15"
41-
object_store = "0.5.6"
41+
object_store = "0.6.1"
4242
once_cell = "1.16.0"
4343
parking_lot = "0.12"
44-
parquet = { version = "40", features = [
44+
parquet = { version = "42", features = [
4545
"async",
4646
"object_store",
4747
], optional = true }
@@ -50,7 +50,7 @@ percent-encoding = "2"
5050
serde = { version = "1", features = ["derive"] }
5151
serde_json = "1"
5252
thiserror = "1"
53-
tokio = { version = "1", features = ["macros", "rt", "parking_lot"] }
53+
tokio = { version = "1.28", features = ["macros", "rt", "rt-multi-thread", "sync", "fs", "parking_lot"] }
5454
regex = "1"
5555
uuid = { version = "1", features = ["serde", "v4"] }
5656
url = "2.3"
@@ -65,7 +65,7 @@ rusoto_dynamodb = { version = "0.47", default-features = false, optional = true
6565
rusoto_glue = { version = "0.47", default-features = false, optional = true }
6666

6767
# Unity
68-
reqwest = { version = "0.11", default-features = false, features = [
68+
reqwest = { version = "0.11.18", default-features = false, features = [
6969
"rustls-tls",
7070
"json",
7171
], optional = true }
@@ -74,15 +74,15 @@ reqwest-retry = { version = "0.2.2", optional = true }
7474

7575
# Datafusion
7676
dashmap = { version = "5", optional = true }
77-
datafusion = { version = "26", optional = true }
78-
datafusion-expr = { version = "26", optional = true }
79-
datafusion-common = { version = "26", optional = true }
80-
datafusion-proto = { version = "26", optional = true }
81-
datafusion-sql = { version = "26", optional = true }
82-
datafusion-physical-expr = { version = "26", optional = true }
77+
datafusion = { version = "27", optional = true }
78+
datafusion-expr = { version = "27", optional = true }
79+
datafusion-common = { version = "27", optional = true }
80+
datafusion-proto = { version = "27", optional = true }
81+
datafusion-sql = { version = "27", optional = true }
82+
datafusion-physical-expr = { version = "27", optional = true }
8383

8484

85-
sqlparser = { version = "0.34", optional = true }
85+
sqlparser = { version = "0.35", optional = true }
8686

8787
# NOTE dependencies only for integration tests
8888
fs_extra = { version = "1.2.0", optional = true }
@@ -135,7 +135,6 @@ s3-native-tls = [
135135
"rusoto_dynamodb/native-tls",
136136
"dynamodb_lock/native-tls",
137137
"object_store/aws",
138-
"object_store/aws_profile",
139138
]
140139
s3 = [
141140
"rusoto_core/rustls",
@@ -144,7 +143,6 @@ s3 = [
144143
"rusoto_dynamodb/rustls",
145144
"dynamodb_lock/rustls",
146145
"object_store/aws",
147-
"object_store/aws_profile",
148146
]
149147
unity-experimental = ["reqwest", "reqwest-middleware", "reqwest-retry"]
150148

rust/src/action/checkpoints.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -211,6 +211,7 @@ pub async fn cleanup_expired_logs_for(
211211
location: Path::from(""),
212212
last_modified: DateTime::<Utc>::MIN_UTC,
213213
size: 0,
214+
e_tag: None,
214215
},
215216
);
216217
let file_needs_time_adjustment =
@@ -255,6 +256,7 @@ pub async fn cleanup_expired_logs_for(
255256
location: current_file.1.location.clone(),
256257
last_modified: last_file.1.last_modified.add(Duration::seconds(1)),
257258
size: 0,
259+
e_tag: None,
258260
},
259261
);
260262
maybe_delete_files.push(updated);

rust/src/data_catalog/storage/mod.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -145,7 +145,7 @@ impl SchemaProvider for ListingSchemaProvider {
145145
mod tests {
146146
use super::*;
147147
use datafusion::assert_batches_sorted_eq;
148-
use datafusion::catalog::catalog::{CatalogProvider, MemoryCatalogProvider};
148+
use datafusion::catalog::{CatalogProvider, MemoryCatalogProvider};
149149
use datafusion::execution::context::SessionContext;
150150

151151
#[test]

rust/src/delta_datafusion.rs

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -36,16 +36,16 @@ use arrow_array::StringArray;
3636
use arrow_schema::Field;
3737
use async_trait::async_trait;
3838
use chrono::{DateTime, NaiveDateTime, Utc};
39-
use datafusion::datasource::datasource::TableProviderFactory;
4039
use datafusion::datasource::file_format::{parquet::ParquetFormat, FileFormat};
40+
use datafusion::datasource::physical_plan::FileScanConfig;
41+
use datafusion::datasource::provider::TableProviderFactory;
4142
use datafusion::datasource::{listing::PartitionedFile, MemTable, TableProvider, TableType};
4243
use datafusion::execution::context::{SessionContext, SessionState, TaskContext};
4344
use datafusion::execution::runtime_env::RuntimeEnv;
4445
use datafusion::execution::FunctionRegistry;
4546
use datafusion::optimizer::utils::conjunction;
4647
use datafusion::physical_expr::PhysicalSortExpr;
4748
use datafusion::physical_optimizer::pruning::{PruningPredicate, PruningStatistics};
48-
use datafusion::physical_plan::file_format::FileScanConfig;
4949
use datafusion::physical_plan::filter::FilterExec;
5050
use datafusion::physical_plan::limit::LocalLimitExec;
5151
use datafusion::physical_plan::{
@@ -1377,7 +1377,6 @@ mod tests {
13771377
use arrow::array::StructArray;
13781378
use arrow::datatypes::{DataType, Field, Schema};
13791379
use chrono::{TimeZone, Utc};
1380-
use datafusion::from_slice::FromSlice;
13811380
use datafusion::physical_plan::empty::EmptyExec;
13821381
use datafusion_proto::physical_plan::AsExecutionPlan;
13831382
use datafusion_proto::protobuf;
@@ -1558,6 +1557,7 @@ mod tests {
15581557
location: Path::from("year=2015/month=1/part-00000-4dcb50d3-d017-450c-9df7-a7257dbd3c5d-c000.snappy.parquet".to_string()),
15591558
last_modified: Utc.timestamp_millis_opt(1660497727833).unwrap(),
15601559
size: 10644,
1560+
e_tag: None
15611561
},
15621562
partition_values: [ScalarValue::Int64(Some(2015)), ScalarValue::Int64(Some(1))].to_vec(),
15631563
range: None,
@@ -1575,8 +1575,8 @@ mod tests {
15751575
let batch = RecordBatch::try_new(
15761576
Arc::clone(&schema),
15771577
vec![
1578-
Arc::new(arrow::array::StringArray::from_slice(["a", "b", "c", "d"])),
1579-
Arc::new(arrow::array::Int32Array::from_slice([1, 10, 10, 100])),
1578+
Arc::new(arrow::array::StringArray::from(vec!["a", "b", "c", "d"])),
1579+
Arc::new(arrow::array::Int32Array::from(vec![1, 10, 10, 100])),
15801580
],
15811581
)
15821582
.unwrap();

rust/src/operations/delete.rs

Lines changed: 15 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -333,7 +333,6 @@ mod tests {
333333
use arrow::datatypes::{Field, Schema};
334334
use arrow::record_batch::RecordBatch;
335335
use datafusion::assert_batches_sorted_eq;
336-
use datafusion::from_slice::FromSlice;
337336
use datafusion::prelude::*;
338337
use std::sync::Arc;
339338

@@ -358,9 +357,9 @@ mod tests {
358357
let batch = RecordBatch::try_new(
359358
Arc::clone(&schema),
360359
vec![
361-
Arc::new(arrow::array::StringArray::from_slice(["A", "B", "A", "A"])),
362-
Arc::new(arrow::array::Int32Array::from_slice([1, 10, 10, 100])),
363-
Arc::new(arrow::array::StringArray::from_slice([
360+
Arc::new(arrow::array::StringArray::from(vec!["A", "B", "A", "A"])),
361+
Arc::new(arrow::array::Int32Array::from(vec![1, 10, 10, 100])),
362+
Arc::new(arrow::array::StringArray::from(vec![
364363
"2021-02-02",
365364
"2021-02-02",
366365
"2021-02-02",
@@ -411,9 +410,9 @@ mod tests {
411410
let batch = RecordBatch::try_new(
412411
Arc::clone(&schema),
413412
vec![
414-
Arc::new(arrow::array::StringArray::from_slice(["A", "B", "A", "A"])),
415-
Arc::new(arrow::array::Int32Array::from_slice([1, 10, 10, 100])),
416-
Arc::new(arrow::array::StringArray::from_slice([
413+
Arc::new(arrow::array::StringArray::from(vec!["A", "B", "A", "A"])),
414+
Arc::new(arrow::array::Int32Array::from(vec![1, 10, 10, 100])),
415+
Arc::new(arrow::array::StringArray::from(vec![
417416
"2021-02-02",
418417
"2021-02-02",
419418
"2021-02-02",
@@ -435,9 +434,9 @@ mod tests {
435434
let batch = RecordBatch::try_new(
436435
Arc::clone(&schema),
437436
vec![
438-
Arc::new(arrow::array::StringArray::from_slice(["A", "B", "A", "A"])),
439-
Arc::new(arrow::array::Int32Array::from_slice([0, 20, 10, 100])),
440-
Arc::new(arrow::array::StringArray::from_slice([
437+
Arc::new(arrow::array::StringArray::from(vec!["A", "B", "A", "A"])),
438+
Arc::new(arrow::array::Int32Array::from(vec![0, 20, 10, 100])),
439+
Arc::new(arrow::array::StringArray::from(vec![
441440
"2021-02-02",
442441
"2021-02-02",
443442
"2021-02-02",
@@ -586,9 +585,9 @@ mod tests {
586585
let batch = RecordBatch::try_new(
587586
Arc::clone(&schema),
588587
vec![
589-
Arc::new(arrow::array::StringArray::from_slice(["A", "B", "A", "A"])),
590-
Arc::new(arrow::array::Int32Array::from_slice([0, 20, 10, 100])),
591-
Arc::new(arrow::array::StringArray::from_slice([
588+
Arc::new(arrow::array::StringArray::from(vec!["A", "B", "A", "A"])),
589+
Arc::new(arrow::array::Int32Array::from(vec![0, 20, 10, 100])),
590+
Arc::new(arrow::array::StringArray::from(vec![
592591
"2021-02-02",
593592
"2021-02-03",
594593
"2021-02-02",
@@ -644,9 +643,9 @@ mod tests {
644643
let batch = RecordBatch::try_new(
645644
Arc::clone(&schema),
646645
vec![
647-
Arc::new(arrow::array::StringArray::from_slice(["A", "B", "A", "A"])),
648-
Arc::new(arrow::array::Int32Array::from_slice([0, 20, 10, 100])),
649-
Arc::new(arrow::array::StringArray::from_slice([
646+
Arc::new(arrow::array::StringArray::from(vec!["A", "B", "A", "A"])),
647+
Arc::new(arrow::array::Int32Array::from(vec![0, 20, 10, 100])),
648+
Arc::new(arrow::array::StringArray::from(vec![
650649
"2021-02-02",
651650
"2021-02-03",
652651
"2021-02-02",

rust/src/operations/transaction/state.rs

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,9 @@ use arrow::array::ArrayRef;
44
use arrow::datatypes::{
55
DataType, Field as ArrowField, Schema as ArrowSchema, SchemaRef as ArrowSchemaRef,
66
};
7+
use datafusion::datasource::physical_plan::wrap_partition_type_in_dict;
78
use datafusion::optimizer::utils::conjunction;
89
use datafusion::physical_optimizer::pruning::{PruningPredicate, PruningStatistics};
9-
use datafusion::physical_plan::file_format::wrap_partition_type_in_dict;
1010
use datafusion_common::config::ConfigOptions;
1111
use datafusion_common::scalar::ScalarValue;
1212
use datafusion_common::{Column, DFSchema, Result as DFResult, TableReference};
@@ -362,6 +362,10 @@ impl ContextProvider for DummyContextProvider {
362362
fn options(&self) -> &ConfigOptions {
363363
&self.options
364364
}
365+
366+
fn get_window_meta(&self, _name: &str) -> Option<Arc<datafusion_expr::WindowUDF>> {
367+
unimplemented!()
368+
}
365369
}
366370

367371
#[cfg(test)]

0 commit comments

Comments
 (0)