Skip to content

Commit

Permalink
Issue #517: Restore QbeastUtil.computeHistogramForColumn method (#521)
Browse files Browse the repository at this point in the history
  • Loading branch information
osopardo1 authored Dec 16, 2024
1 parent f9deeeb commit adf8b39
Showing 1 changed file with 52 additions and 0 deletions.
52 changes: 52 additions & 0 deletions src/main/scala/io/qbeast/utils/QbeastUtils.scala
Original file line number Diff line number Diff line change
Expand Up @@ -217,4 +217,56 @@ object QbeastUtils extends Logging {

}

/**
 * Compute the histogram for a given column.
 *
 * Since computing the histogram can be expensive, this method is meant to be used outside the
 * indexing process.
 *
 * The histogram is returned as a single String with the format `['bin1','bin2','bin3',...]`,
 * where every element is the minimum value of one bin, rendered as a String and wrapped in
 * single quotes. Bins are obtained by grouping the distinct, non-null values of the column by
 * `MultiDimClusteringFunctions.range_partition_id(col(columnName), numBins)`, and the bin
 * starts are emitted in ascending order.
 *
 * For example:
 *
 * {{{
 * val histogram = QbeastUtils.computeHistogramForColumn(df, "column")
 *
 * df.write
 *   .format("qbeast")
 *   .option("columnsToIndex", "column:histogram")
 *   .option("columnStats", histogram)
 *   .save()
 * }}}
 *
 * An exception built by `AnalysisExceptionFactory` is thrown when `columnName` is not a column
 * of `df`.
 *
 * @param df
 *   the DataFrame that contains the column
 * @param columnName
 *   the name of the column to compute the histogram for
 * @param numBins
 *   the number of bins requested from the range partitioning (50 by default)
 * @return
 *   the bin starts formatted as `['v1','v2',...]`; `[]` when no non-null value is found
 */
@deprecated("Use computeQuantilesForColumn method instead", "0.8.0")
def computeHistogramForColumn(df: DataFrame, columnName: String, numBins: Int = 50): String = {

  import df.sparkSession.implicits._
  if (!df.columns.contains(columnName)) {
    throw AnalysisExceptionFactory.create(s"Column $columnName does not exist in the dataframe")
  }

  log.info(s"Computing histogram for column $columnName with number of bins $numBins")
  val binStarts = "__bin_starts"
  val stringPartitionColumn =
    MultiDimClusteringFunctions.range_partition_id(col(columnName), numBins)

  // The ordering is applied on the column's original type, before the values are read back as
  // Strings by `.as[String]`.
  val histogram = df
    .select(columnName)
    .distinct()
    .na
    .drop()
    .groupBy(stringPartitionColumn)
    .agg(min(columnName).alias(binStarts))
    .select(binStarts)
    .orderBy(binStarts)
    .as[String]
    .collect()

  // NOTE(review): values are quoted verbatim; a value containing a single quote is not escaped.
  val formattedHistogram = histogram
    .map(string => s"'$string'")
    .mkString("[", ",", "]")

  // Log the formatted String: interpolating the Array itself would only print the JVM default
  // `[Ljava.lang.String;@<hash>` representation instead of the bin values.
  log.info(s"Histogram for column $columnName: $formattedHistogram")
  formattedHistogram
}

}

0 comments on commit adf8b39

Please sign in to comment.