Skip to content

Commit adf8b39

Browse files
authored
Issue #517: Restore QbeastUtil.computeHistogramForColumn method (#521)
1 parent f9deeeb commit adf8b39

File tree

1 file changed

+52
-0
lines changed

1 file changed

+52
-0
lines changed

src/main/scala/io/qbeast/utils/QbeastUtils.scala

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -217,4 +217,56 @@ object QbeastUtils extends Logging {
217217

218218
}
219219

220+
/**
 * Compute the histogram for a given column.
 *
 * Since computing the histogram can be expensive, this method is meant to be used outside the
 * indexing process.
 *
 * The distinct, non-null values of the column are grouped by the id returned from
 * `MultiDimClusteringFunctions.range_partition_id(col(columnName), numBins)`, and the minimum
 * value of each group is taken as the start of a bin. The bin starts are ordered, read as
 * strings, wrapped in single quotes and joined as `['bin1','bin2','bin3',...]`.
 *
 * For example:
 *
 * {{{
 * val histogram = QbeastUtils.computeHistogramForColumn(df, "column")
 *
 * df.write
 *   .format("qbeast")
 *   .option("columnsToIndex", "column:histogram")
 *   .option("columnStats", histogram)
 *   .save()
 * }}}
 *
 * Fails with the exception built by `AnalysisExceptionFactory.create` when `columnName` is not
 * one of the columns of `df`.
 *
 * @param df
 *   the DataFrame that contains the column
 * @param columnName
 *   the name of the column to compute the histogram for
 * @param numBins
 *   the number passed to `range_partition_id` to split the values into bins; 50 by default
 * @return
 *   the bin starts formatted as `['bin1','bin2',...]`
 */
@deprecated("Use computeQuantilesForColumn method instead", "0.8.0")
def computeHistogramForColumn(df: DataFrame, columnName: String, numBins: Int = 50): String = {

  import df.sparkSession.implicits._
  if (!df.columns.contains(columnName)) {
    throw AnalysisExceptionFactory.create(s"Column $columnName does not exist in the dataframe")
  }

  log.info(s"Computing histogram for column $columnName with number of bins $numBins")
  val binStarts = "__bin_starts"
  val stringPartitionColumn =
    MultiDimClusteringFunctions.range_partition_id(col(columnName), numBins)

  val histogram = df
    .select(columnName)
    .distinct()
    .na
    .drop()
    .groupBy(stringPartitionColumn)
    .agg(min(columnName).alias(binStarts))
    .select(binStarts)
    .orderBy(binStarts)
    .as[String]
    .collect()

  // Format before logging: interpolating the Array[String] directly would only print the
  // JVM array identity (e.g. "[Ljava.lang.String;@1b2c3d") instead of the bin values.
  val histogramString = histogram
    .map(string => s"'$string'")
    .mkString("[", ",", "]")

  log.info(s"Histogram for column $columnName: $histogramString")
  histogramString
}
271+
220272
}

0 commit comments

Comments
 (0)