Skip to content

Commit e522af2

Browse files
committed
refactor: Use the configuration in HoodieStorageConfig to calculate the estimated proportions
1. Use the configuration in HoodieStorageConfig to calculate the estimated proportions Signed-off-by: TheR1sing3un <[email protected]>
1 parent 98f8b5a commit e522af2

File tree

3 files changed

+5
-16
lines changed

3 files changed

+5
-16
lines changed

hudi-common/src/main/java/org/apache/hudi/common/config/HoodieReaderConfig.java

Lines changed: 0 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,6 @@
2121

2222
import javax.annotation.concurrent.Immutable;
2323

24-
import java.util.Map;
25-
2624
/**
2725
* Configurations for reading a file group
2826
*/
@@ -91,15 +89,4 @@ public class HoodieReaderConfig extends HoodieConfig {
9189
"hoodie.write.record.merge.custom.implementation.classes";
9290
public static final String RECORD_MERGE_IMPL_CLASSES_DEPRECATED_WRITE_CONFIG_KEY =
9391
"hoodie.datasource.write.record.merger.impls";
94-
95-
public static final ConfigProperty<Double> LOG_FILE_TO_PARQUET_FORMAT_SIZE_ESTIMATION_FRACTION = ConfigProperty
96-
.key("hoodie.logfile.to.parquet.format.size.estimation.fraction")
97-
.defaultValue(0.80)
98-
.markAdvanced()
99-
.withDocumentation("Estimate the size of the log file in the parquet file format."
100-
+ "For AVRO-encoded log blocks, lower this value to get a more accurate estimate");
101-
102-
public static Double getLogFileToParquetFormatSizeEstimationFraction(Map<String, String> options) {
103-
return Double.parseDouble(options.getOrDefault(LOG_FILE_TO_PARQUET_FORMAT_SIZE_ESTIMATION_FRACTION.key(), LOG_FILE_TO_PARQUET_FORMAT_SIZE_ESTIMATION_FRACTION.defaultValue().toString()));
104-
}
10592
}

hudi-common/src/main/java/org/apache/hudi/common/config/HoodieStorageConfig.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -186,7 +186,8 @@ public class HoodieStorageConfig extends HoodieConfig {
186186
.defaultValue(String.valueOf(0.35))
187187
.markAdvanced()
188188
.withDocumentation("Expected additional compression as records move from log files to parquet. Used for merge_on_read "
189-
+ "table to send inserts into log files & control the size of compacted parquet file.");
189+
+ "table to send inserts into log files & control the size of compacted parquet file."
190+
+ "When encoding log blocks in parquet format, increase this value for a more accurate estimation");
190191

191192
// Configs that control the bloom filter that is written to the file footer
192193
public static final ConfigProperty<String> BLOOM_FILTER_TYPE = ConfigProperty

hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieFileIndex.scala

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ import org.apache.hudi.BaseHoodieTableFileIndex.PartitionPath
2121
import org.apache.hudi.DataSourceWriteOptions.{PARTITIONPATH_FIELD, PRECOMBINE_FIELD, RECORDKEY_FIELD}
2222
import org.apache.hudi.HoodieFileIndex.{collectReferencedColumns, convertFilterForTimestampKeyGenerator, getConfigProperties, DataSkippingFailureMode}
2323
import org.apache.hudi.HoodieSparkConfUtils.getConfigValue
24-
import org.apache.hudi.common.config.{HoodieMetadataConfig, HoodieReaderConfig, TypedProperties}
24+
import org.apache.hudi.common.config.{HoodieMetadataConfig, HoodieStorageConfig, TypedProperties}
2525
import org.apache.hudi.common.config.TimestampKeyGeneratorConfig.{TIMESTAMP_INPUT_DATE_FORMAT, TIMESTAMP_OUTPUT_DATE_FORMAT}
2626
import org.apache.hudi.common.model.{FileSlice, HoodieBaseFile, HoodieLogFile}
2727
import org.apache.hudi.common.table.{HoodieTableConfig, HoodieTableMetaClient}
@@ -169,7 +169,8 @@ case class HoodieFileIndex(spark: SparkSession,
169169
val prunedPartitionsAndFilteredFileSlices = filterFileSlices(dataFilters, partitionFilters).map {
170170
case (partitionOpt, fileSlices) =>
171171
if (shouldEmbedFileSlices) {
172-
val logFileEstimationFraction = HoodieReaderConfig.getLogFileToParquetFormatSizeEstimationFraction(options.asJava)
172+
val logFileEstimationFraction = options.getOrElse(HoodieStorageConfig.LOGFILE_TO_PARQUET_COMPRESSION_RATIO_FRACTION.key(),
173+
HoodieStorageConfig.LOGFILE_TO_PARQUET_COMPRESSION_RATIO_FRACTION.defaultValue()).toDouble
173174
// 1. Generate a disguised representative file for each file slice, which spark uses to optimize rdd partition parallelism based on data such as file size
174175
// For file slice only has base file, we directly use the base file size as representative file size
175176
// For file slice has log file, we estimate the representative file size based on the log file size and option(base file) size

0 commit comments

Comments (0)