Commit 8a2cb56

Merge branch 'lakesoul-io:main' into main

2 parents: 8667644 + 413a913

File tree

34 files changed: +2376 −1286 lines


.github/workflows/consistency-ci.yml

Lines changed: 1 addition & 1 deletion
@@ -116,7 +116,7 @@ jobs:
       - name: Generate benchmark data and expected query results
         run: |
           mkdir -p lakesoul/test_files/tpch/data
-          git clone https://github.com/databricks/tpch-dbgen.git
+          git clone --branch master --depth 1 https://github.com/databricks/tpch-dbgen.git
           cd tpch-dbgen
           make
           ./dbgen -f -s 0.1

.github/workflows/flink-cdc-hdfs-test.yml

Lines changed: 1 addition & 1 deletion
@@ -128,7 +128,7 @@ jobs:
       - name: Start compaction task
         run: |
           cd ./script/benchmark/work-dir
-          nohup docker run --cpus 2 -m 5000m --net lakesoul-docker-compose-env_default --rm -t -v $HADOOP_HOME:/opt/hadoop --env HADOOP_HOME=/opt/hadoop -v ${PWD}:/opt/spark/work-dir --env lakesoul_home=/opt/spark/work-dir/lakesoul.properties bitnami/spark:3.3.1 spark-submit --proxy-user flink --driver-memory 2G --executor-memory 2G --conf spark.driver.memoryOverhead=1500m --conf spark.executor.memoryOverhead=1500m --conf spark.hadoop.fs.s3.buffer.dir=/tmp --conf spark.hadoop.fs.s3a.buffer.dir=/tmp --conf spark.hadoop.fs.s3a.fast.upload.buffer=disk --conf spark.hadoop.fs.s3a.fast.upload=true --conf spark.dmetasoul.lakesoul.native.io.enable=true --conf spark.dmetasoul.lakesoul.compaction.level1.file.number.limit=5 --conf spark.dmetasoul.lakesoul.compaction.level1.file.merge.num.limit=2 --class com.dmetasoul.lakesoul.spark.compaction.NewCompactionTask --master local[4] /opt/spark/work-dir/$SPARK_JAR_NAME --threadpool.size=10 --database="" --file_num_limit=5 --file_size_limit=10KB > compaction.log 2>&1 &
+          nohup docker run --cpus 2 -m 5000m --net lakesoul-docker-compose-env_default --rm -t -v $HADOOP_HOME:/opt/hadoop --env HADOOP_HOME=/opt/hadoop -v ${PWD}:/opt/spark/work-dir --env lakesoul_home=/opt/spark/work-dir/lakesoul.properties bitnami/spark:3.3.1 spark-submit --proxy-user flink --driver-memory 2G --executor-memory 2G --conf spark.driver.memoryOverhead=1500m --conf spark.executor.memoryOverhead=1500m --conf spark.hadoop.fs.s3.buffer.dir=/tmp --conf spark.hadoop.fs.s3a.buffer.dir=/tmp --conf spark.hadoop.fs.s3a.fast.upload.buffer=disk --conf spark.hadoop.fs.s3a.fast.upload=true --conf spark.dmetasoul.lakesoul.native.io.enable=true --conf spark.dmetasoul.lakesoul.compaction.level.file.number.limit=5 --conf spark.dmetasoul.lakesoul.compaction.level.file.merge.num.limit=2 --class com.dmetasoul.lakesoul.spark.compaction.NewCompactionTask --master local[4] /opt/spark/work-dir/$SPARK_JAR_NAME --threadpool.size=10 --database="" --file_num_limit=5 --file_size_limit=10KB > compaction.log 2>&1 &
       - name: Start flink mysql cdc task-1
         run: |
           docker exec -t -u flink lakesoul-docker-compose-env-jobmanager-1 flink run -d -c org.apache.flink.lakesoul.entry.MysqlCdc /opt/flink/work-dir/$FLINK_JAR_NAME --source_db.host mysql --source_db.port 3306 --source_db.db_name test_cdc --source_db.user root --source_db.password root --source.parallelism 2 --sink.parallelism 4 --use.cdc true --warehouse_path hdfs://172.17.0.1:9000/lakesoul-test-bucket/data/ --flink.checkpoint hdfs://172.17.0.1:9000/lakesoul-test-bucket/chk --flink.savepoint hdfs://172.17.0.1:9000/lakesoul-test-bucket/svp --job.checkpoint_interval 5000 --server_time_zone UTC

.github/workflows/flink-cdc-test.yml

Lines changed: 1 addition & 1 deletion
@@ -117,7 +117,7 @@ jobs:
       - name: Start compaction task
         run: |
           cd ./script/benchmark/work-dir
-          nohup docker run --cpus 2 -m 5000m --net lakesoul-docker-compose-env_default --rm -t -v ${PWD}:/opt/spark/work-dir --env lakesoul_home=/opt/spark/work-dir/lakesoul.properties bitnami/spark:3.3.1 spark-submit --driver-memory 2G --executor-memory 2G --conf spark.driver.memoryOverhead=1500m --conf spark.executor.memoryOverhead=1500m --conf spark.hadoop.fs.s3.buffer.dir=/tmp --conf spark.hadoop.fs.s3a.buffer.dir=/tmp --conf spark.hadoop.fs.s3.impl=org.apache.hadoop.fs.s3a.S3AFileSystem --conf spark.hadoop.fs.s3a.path.style.access=true --conf spark.hadoop.fs.s3a.endpoint=http://minio:9000 --conf spark.hadoop.fs.s3a.access.key=minioadmin1 --conf spark.hadoop.fs.s3a.secret.key=minioadmin1 --conf spark.sql.warehouse.dir=s3://lakesoul-test-bucket/ --conf spark.hadoop.fs.s3a.fast.upload.buffer=disk --conf spark.hadoop.fs.s3a.fast.upload=true --conf spark.dmetasoul.lakesoul.native.io.enable=true --conf spark.dmetasoul.lakesoul.compaction.level1.file.number.limit=5 --conf spark.dmetasoul.lakesoul.compaction.level1.file.merge.num.limit=2 --class com.dmetasoul.lakesoul.spark.compaction.NewCompactionTask --master local[4] /opt/spark/work-dir/$SPARK_JAR_NAME --threadpool.size=10 --database="" --file_num_limit=5 --file_size_limit=10KB > compaction.log 2>&1 &
+          nohup docker run --cpus 2 -m 5000m --net lakesoul-docker-compose-env_default --rm -t -v ${PWD}:/opt/spark/work-dir --env lakesoul_home=/opt/spark/work-dir/lakesoul.properties bitnami/spark:3.3.1 spark-submit --driver-memory 2G --executor-memory 2G --conf spark.driver.memoryOverhead=1500m --conf spark.executor.memoryOverhead=1500m --conf spark.hadoop.fs.s3.buffer.dir=/tmp --conf spark.hadoop.fs.s3a.buffer.dir=/tmp --conf spark.hadoop.fs.s3.impl=org.apache.hadoop.fs.s3a.S3AFileSystem --conf spark.hadoop.fs.s3a.path.style.access=true --conf spark.hadoop.fs.s3a.endpoint=http://minio:9000 --conf spark.hadoop.fs.s3a.access.key=minioadmin1 --conf spark.hadoop.fs.s3a.secret.key=minioadmin1 --conf spark.sql.warehouse.dir=s3://lakesoul-test-bucket/ --conf spark.hadoop.fs.s3a.fast.upload.buffer=disk --conf spark.hadoop.fs.s3a.fast.upload=true --conf spark.dmetasoul.lakesoul.native.io.enable=true --conf spark.dmetasoul.lakesoul.compaction.level.file.number.limit=5 --conf spark.dmetasoul.lakesoul.compaction.level.file.merge.num.limit=2 --class com.dmetasoul.lakesoul.spark.compaction.NewCompactionTask --master local[4] /opt/spark/work-dir/$SPARK_JAR_NAME --threadpool.size=10 --database="" --file_num_limit=5 --file_size_limit=10KB > compaction.log 2>&1 &
       - name: Start flink mysql cdc task-1
         run: |
           docker exec -t lakesoul-docker-compose-env-jobmanager-1 flink run -d -c org.apache.flink.lakesoul.entry.MysqlCdc /opt/flink/work-dir/$FLINK_JAR_NAME --source_db.host mysql --source_db.port 3306 --source_db.db_name test_cdc --source_db.user root --source_db.password root --source.parallelism 2 --sink.parallelism 4 --use.cdc true --warehouse_path s3://lakesoul-test-bucket/data/ --flink.checkpoint s3://lakesoul-test-bucket/chk --flink.savepoint s3://lakesoul-test-bucket/svp --job.checkpoint_interval 5000 --server_time_zone UTC
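Both workflow changes above rename the compaction limits from spark.dmetasoul.lakesoul.compaction.level1.* to spark.dmetasoul.lakesoul.compaction.level.*. As a hedged illustration only (not part of this commit), the same renamed keys could be set programmatically when building the Spark session for a compaction run; the values below simply mirror the ones used in the workflows.

// Hedged sketch: setting the renamed compaction limits on a SparkSession.
// Key names come from the workflow diffs above; master, app name, and values
// are illustrative placeholders.
import org.apache.spark.sql.SparkSession;

public class CompactionConfSketch {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
                .master("local[4]")
                .appName("lakesoul-compaction-conf-sketch")
                .config("spark.dmetasoul.lakesoul.native.io.enable", "true")
                // renamed keys: "level" instead of "level1"
                .config("spark.dmetasoul.lakesoul.compaction.level.file.number.limit", "5")
                .config("spark.dmetasoul.lakesoul.compaction.level.file.merge.num.limit", "2")
                .getOrCreate();

        // ... run the compaction work here ...
        spark.stop();
    }
}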

lakesoul-flink/src/main/java/org/apache/flink/lakesoul/tool/FlinkUtil.java

Lines changed: 1 addition & 0 deletions
@@ -411,6 +411,7 @@ public static void setIOConfigs(Configuration conf, NativeIOBase io) {
         setFSConf(conf, "fs.s3a.endpoint", "fs.s3a.endpoint", io);
         setFSConf(conf, "fs.s3a.endpoint.region", "fs.s3a.endpoint.region", io);
         setFSConf(conf, "fs.s3a.path.style.access", "fs.s3a.path.style.access", io);
+        setFSConf(conf, "fs.s3a.s3.signing-algorithm", "fs.s3a.s3.signing-algorithm", io);
         // try flink's s3 credential configs
         setFSConf(conf, S3_ACCESS_KEY.key(), "fs.s3a.access.key", io);
         setFSConf(conf, S3_SECRET_KEY.key(), "fs.s3a.secret.key", io);
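The added setFSConf call forwards fs.s3a.s3.signing-algorithm from the Flink Configuration to the native IO layer. A minimal sketch of how a job might supply that key follows; the endpoint and the "S3SignerType" value are assumptions for illustration, not taken from this commit.

// Hedged sketch: supplying the signing algorithm in a Flink Configuration so
// FlinkUtil.setIOConfigs (above) can copy it to the native IO layer.
import org.apache.flink.configuration.Configuration;

public class SignerConfigSketch {
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        conf.setString("fs.s3a.endpoint", "http://minio:9000");          // illustrative endpoint
        conf.setString("fs.s3a.path.style.access", "true");
        conf.setString("fs.s3a.s3.signing-algorithm", "S3SignerType");   // illustrative signer value
        // FlinkUtil.setIOConfigs(conf, io) would now forward the signer key as well.
    }
}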

lakesoul-presto/src/main/java/com/facebook/presto/lakesoul/LakeSoulConfig.java

Lines changed: 10 additions & 0 deletions
@@ -27,6 +27,7 @@ private LakeSoulConfig(Map<String, String> config){
         this.region = config.get("fs.s3a.endpoint.region");
         this.bucketName = config.get("fs.s3a.bucket");
         this.endpoint = config.get("fs.s3a.endpoint");
+        this.signer = config.get("fs.s3a.s3.signing-algorithm");
         this.defaultFS = config.get("fs.defaultFS");
         this.user = config.get("fs.hdfs.user");
         this.virtualPathStyle = Boolean.parseBoolean(config.getOrDefault("fs.s3a.path.style.access", "false"));
@@ -38,6 +39,7 @@ private LakeSoulConfig(Map<String, String> config){
     private String region;
     private String bucketName;
     private String endpoint;
+    private String signer;
     private String user;
     private String defaultFS;
     private String timeZone;
@@ -84,6 +86,14 @@ public void setEndpoint(String endpoint) {
         this.endpoint = endpoint;
     }

+    public String getSigner() {
+        return signer;
+    }
+
+    public void setSigner(String signer) {
+        this.signer = signer;
+    }
+
     public String getUser() {
         return user;
     }

lakesoul-presto/src/main/java/com/facebook/presto/lakesoul/LakeSoulPageSource.java

Lines changed: 1 addition & 0 deletions
@@ -101,6 +101,7 @@ public LakeSoulPageSource(LakeSoulSplit split, ArrowBlockBuilder arrowBlockBuild
                 LakeSoulConfig.getInstance().getRegion(),
                 LakeSoulConfig.getInstance().getBucketName(),
                 LakeSoulConfig.getInstance().getEndpoint(),
+                LakeSoulConfig.getInstance().getSigner(),
                 LakeSoulConfig.getInstance().getDefaultFS(),
                 LakeSoulConfig.getInstance().getUser(),
                 LakeSoulConfig.getInstance().isVirtualPathStyle()

lakesoul-presto/src/main/java/com/facebook/presto/lakesoul/LakeSoulRecordCursor.java

Lines changed: 1 addition & 0 deletions
@@ -98,6 +98,7 @@ public LakeSoulRecordCursor(LakeSoulRecordSet recordSet) throws IOException {
                 LakeSoulConfig.getInstance().getRegion(),
                 LakeSoulConfig.getInstance().getBucketName(),
                 LakeSoulConfig.getInstance().getEndpoint(),
+                LakeSoulConfig.getInstance().getSigner(),
                 LakeSoulConfig.getInstance().getDefaultFS(),
                 LakeSoulConfig.getInstance().getUser(),
                 LakeSoulConfig.getInstance().isVirtualPathStyle()
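The three Presto-side changes plumb the same key through LakeSoulConfig into the page source and record cursor. A minimal sketch of the map entry the LakeSoulConfig constructor reads is shown below; how that map is assembled and handed to getInstance() is outside this diff, and the endpoint and signer values are illustrative placeholders.

// Hedged sketch: the configuration entry consumed by the LakeSoulConfig
// constructor shown above and later exposed via getSigner().
import java.util.HashMap;
import java.util.Map;

public class PrestoSignerSketch {
    public static void main(String[] args) {
        Map<String, String> config = new HashMap<>();
        config.put("fs.s3a.endpoint", "http://minio:9000");            // illustrative
        config.put("fs.s3a.path.style.access", "true");
        config.put("fs.s3a.s3.signing-algorithm", "S3SignerType");     // illustrative

        // LakeSoulConfig would expose this value via getSigner(); both
        // LakeSoulPageSource and LakeSoulRecordCursor now pass it along.
        System.out.println(config.get("fs.s3a.s3.signing-algorithm"));
    }
}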

lakesoul-spark/pom.xml

Lines changed: 52 additions & 1 deletion
@@ -140,6 +140,10 @@ SPDX-License-Identifier: Apache-2.0
                    <groupId>com.google.protobuf</groupId>
                    <artifactId>protobuf-java</artifactId>
                </exclusion>
+               <exclusion>
+                   <groupId>org.apache.hadoop</groupId>
+                   <artifactId>*</artifactId>
+               </exclusion>
            </exclusions>
        </dependency>
        <!-- https://mvnrepository.com/artifact/io.jhdf/jhdf -->
@@ -161,6 +165,10 @@ SPDX-License-Identifier: Apache-2.0
                    <groupId>com.google.protobuf</groupId>
                    <artifactId>protobuf-java</artifactId>
                </exclusion>
+               <exclusion>
+                   <groupId>org.apache.hadoop</groupId>
+                   <artifactId>*</artifactId>
+               </exclusion>
            </exclusions>
        </dependency>

@@ -189,6 +197,16 @@ SPDX-License-Identifier: Apache-2.0
            <artifactId>spark-sql-kafka-0-10_2.12</artifactId>
            <version>${spark.version}</version>
            <scope>${local.scope}</scope>
+           <exclusions>
+               <exclusion>
+                   <groupId>com.google.protobuf</groupId>
+                   <artifactId>protobuf-java</artifactId>
+               </exclusion>
+               <exclusion>
+                   <groupId>org.apache.hadoop</groupId>
+                   <artifactId>*</artifactId>
+               </exclusion>
+           </exclusions>
        </dependency>
        <dependency>
            <groupId>org.apache.kafka</groupId>
@@ -260,6 +278,10 @@ SPDX-License-Identifier: Apache-2.0
                    <groupId>com.fasterxml.jackson.core</groupId>
                    <artifactId>*</artifactId>
                </exclusion>
+               <exclusion>
+                   <groupId>org.apache.hadoop</groupId>
+                   <artifactId>*</artifactId>
+               </exclusion>
            </exclusions>
        </dependency>

@@ -298,13 +320,29 @@ SPDX-License-Identifier: Apache-2.0
            <version>${spark.version}</version>
            <scope>test</scope>
            <classifier>tests</classifier>
+           <exclusions>
+               <exclusion>
+                   <groupId>com.google.protobuf</groupId>
+                   <artifactId>protobuf-java</artifactId>
+               </exclusion>
+               <exclusion>
+                   <groupId>org.apache.hadoop</groupId>
+                   <artifactId>*</artifactId>
+               </exclusion>
+           </exclusions>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-hive_${scala.binary.version}</artifactId>
            <version>${spark.version}</version>
            <scope>test</scope>
            <classifier>tests</classifier>
+           <exclusions>
+               <exclusion>
+                   <groupId>org.apache.hadoop</groupId>
+                   <artifactId>*</artifactId>
+               </exclusion>
+           </exclusions>
        </dependency>

        <dependency>
@@ -318,7 +356,19 @@ SPDX-License-Identifier: Apache-2.0
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-aws</artifactId>
-           <version>3.3.6</version>
+           <version>3.3.4</version>
+           <scope>provided</scope>
+       </dependency>
+       <dependency>
+           <groupId>org.apache.hadoop</groupId>
+           <artifactId>hadoop-client-api</artifactId>
+           <version>3.3.4</version>
+           <scope>provided</scope>
+       </dependency>
+       <dependency>
+           <groupId>org.apache.hadoop</groupId>
+           <artifactId>hadoop-client-runtime</artifactId>
+           <version>3.3.4</version>
            <scope>provided</scope>
        </dependency>

@@ -333,6 +383,7 @@ SPDX-License-Identifier: Apache-2.0
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-mllib_${scala.binary.version}</artifactId>
            <version>${spark.version}</version>
+           <scope>provided</scope>
        </dependency>

        <!-- for test only. we don't rely on gluten during package and runtime -->
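With hadoop-aws, hadoop-client-api, and hadoop-client-runtime pinned to 3.3.4 and marked provided, and Hadoop artifacts excluded from the Spark dependencies, the Hadoop classes must come from the runtime environment (for example the Spark distribution) rather than from the LakeSoul jar. A small, hedged sketch for checking which Hadoop version the runtime classpath actually supplies:

// Hedged sketch: printing the Hadoop version found on the runtime classpath,
// to confirm it matches the 3.3.4 line expected by the provided-scope
// dependencies above. The class name here is illustrative.
import org.apache.hadoop.util.VersionInfo;

public class HadoopVersionCheck {
    public static void main(String[] args) {
        System.out.println("Hadoop version on classpath: " + VersionInfo.getVersion());
    }
}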
